In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [2]:
data = pd.read_csv('train.csv', names=['id','keyword', 'location', 'text', 'target'], skiprows = 1)
test_data = pd.read_csv('test.csv', names=['id','keyword', 'location', 'text'], skiprows = 1)
print("Training data \n",data.head())
print("Test data \n",test_data.head())
print("Length of Training data \n", len(data))
print("Length of Test data \n", len(test_data))


Training data 
    id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
Test data 
    id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and 

In [3]:
data_y = data['target']
data_x = data['text']
test = test_data['text']
print(data_y.head())
print(data_x.head())

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64
0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object


In [4]:
cv = CountVectorizer(min_df = 1 , stop_words = 'english')
x_traincv = cv.fit_transform(data_x)
y_train = data_y
test = cv.transform(test)
print("Test: \n", test)
print("x_traincv: \n", x_traincv)

Test: 
   (0, 3849)	1
  (0, 4918)	1
  (0, 8653)	1
  (0, 10427)	1
  (0, 18386)	1
  (1, 4302)	1
  (1, 5721)	1
  (1, 6311)	1
  (1, 8790)	1
  (1, 16324)	1
  (1, 17643)	1
  (2, 7436)	1
  (2, 7563)	1
  (2, 14575)	1
  (2, 16457)	1
  (2, 17492)	1
  (2, 17771)	1
  (3, 2091)	1
  (3, 11311)	1
  (3, 17477)	1
  (3, 20344)	1
  (4, 425)	1
  (4, 4203)	1
  (4, 10713)	1
  (4, 17354)	1
  :	:
  (3259, 11521)	1
  (3259, 14651)	1
  (3259, 15883)	1
  (3259, 17735)	1
  (3259, 20530)	1
  (3259, 20896)	1
  (3260, 4177)	1
  (3260, 5563)	1
  (3260, 8349)	1
  (3260, 9192)	1
  (3260, 11352)	1
  (3261, 658)	1
  (3261, 8738)	1
  (3261, 9192)	1
  (3261, 9283)	1
  (3261, 9950)	1
  (3261, 12107)	1
  (3261, 13851)	1
  (3261, 20192)	1
  (3262, 1547)	1
  (3262, 4309)	1
  (3262, 6538)	1
  (3262, 12727)	1
  (3262, 14444)	1
  (3262, 21089)	1
x_traincv: 
   (0, 5429)	1
  (0, 15499)	1
  (0, 6311)	1
  (0, 1844)	1
  (0, 7572)	1
  (1, 7563)	1
  (1, 12979)	1
  (1, 10967)	1
  (1, 16087)	1
  (1, 16431)	1
  (1, 3790)	1
  (2, 15761)	1


In [5]:
clf = MultinomialNB()
clf.fit(x_traincv,y_train)
predictions = clf.predict(test)
print(predictions)
position_of_fakenews = np.where(predictions != 0)[0]
for i in range(0 , len(position_of_fakenews)):
    position_of_fakenews[i] += 1
print('fakenews rows')
print(position_of_fakenews)

[1 1 1 ... 1 1 1]
fakenews rows
[   1    2    3 ... 3261 3262 3263]


In [6]:
pred = np.array(predictions)
ids = np.arange(1114)
for i in range(0,1114):
    ids[i] += 1
print(pred)
print(ids)
print(len(pred))
print(len(ids))
to_submit = pd.DataFrame({'target': pred})
to_submit.index += 1

[1 1 1 ... 1 1 1]
[   1    2    3 ... 1112 1113 1114]
3263
1114


In [7]:
to_submit.target = to_submit.target.astype(bool)
to_submit = pd.DataFrame(to_submit)
to_submit.target.to_csv('nlp.csv' , index_label = 'id', header = 'False')