In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [2]:
# Load the labelled training tweets and the unlabelled test tweets.
# Column names are supplied explicitly and the first line of each file is
# skipped; the test file has the same columns minus the trailing 'target'.
train_columns = ['id', 'keyword', 'location', 'text', 'target']
data = pd.read_csv('train.csv', names=train_columns, skiprows=1)
test_data = pd.read_csv('test.csv', names=train_columns[:-1], skiprows=1)

# Preview the first rows of each frame, then report the row counts.
for caption, frame in (("Training data \n", data), ("Test data \n", test_data)):
    print(caption, frame.head())
for caption, frame in (("Length of Training data \n", data),
                       ("Length of Test data \n", test_data)):
    print(caption, len(frame))

# Class balance of the training labels (0: fake, 1: real).
data.groupby('target').size()

Training data 
    id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
Test data 
    id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and 

target
0    4342
1    3271
dtype: int64

In [3]:
# Split the training frame into raw tweet text (features) and labels,
# and pull the raw tweet text out of the test frame.
data_x, data_y = data['text'], data['target']
test = test_data['text']

# Quick look at the first few labels, then the first few tweets.
for preview in (data_y, data_x):
    print(preview.head())

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64
0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object


In [4]:
# Fit the TF-IDF vocabulary/weights on the training tweets only, then reuse
# the fitted vectoriser to encode the test tweets.
cv = TfidfVectorizer(min_df=1, stop_words='english')
x_traincv = cv.fit_transform(data_x)
y_train = data_y

# Transform straight from test_data['text'] instead of from `test`: the
# result is stored back in `test`, so reading the input from `test` would
# feed the already-vectorised sparse matrix into the vectoriser when this
# cell is re-run. Reading from the source column keeps the cell idempotent.
test = cv.transform(test_data['text'])

print("Test: \n", test)
print("x_traincv: \n", x_traincv)

Test: 
   (0, 18386)	0.5785569375412292
  (0, 10427)	0.3083617017638652
  (0, 8653)	0.5110962656217939
  (0, 4918)	0.38558648171231935
  (0, 3849)	0.4003605915876077
  (1, 17643)	0.3767157106501493
  (1, 16324)	0.4528914383610909
  (1, 8790)	0.36620811397756187
  (1, 6311)	0.3533478550014478
  (1, 5721)	0.41666332373627535
  (1, 4302)	0.46947119579851554
  (2, 17771)	0.3825599751462522
  (2, 17492)	0.3849686070180602
  (2, 16457)	0.355082996910584
  (2, 14575)	0.47133470551232504
  (2, 7563)	0.3270506235727493
  (2, 7436)	0.50024452753824
  (3, 20344)	0.4714105817569697
  (3, 17477)	0.5779957833582845
  (3, 11311)	0.5346584525727388
  (3, 2091)	0.3972823642198616
  (4, 19209)	0.3515887158571113
  (4, 18164)	0.4312148027573012
  (4, 17354)	0.39840731030347953
  (4, 10713)	0.380677754703344
  :	:
  (3259, 8675)	0.33402462992888626
  (3259, 4307)	0.21053575088136406
  (3259, 3250)	0.22578320086395248
  (3259, 1939)	0.15274089361180732
  (3259, 631)	0.33402462992888626
  (3259, 347)	0.3340

In [5]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features and
# predict a label for every vectorised test tweet.
clf = MultinomialNB()
clf.fit(x_traincv, y_train)
predictions = clf.predict(test)
print(len(predictions))
print(predictions)

# 1-based positions of the test rows predicted as label 0.
# A single vectorised "+ 1" replaces the element-by-element Python loop.
position_of_fakenews = np.flatnonzero(predictions == 0) + 1
print('fakenews rows')
print(position_of_fakenews)
print(len(position_of_fakenews))

3263
[0 1 1 ... 1 1 1]
fakenews rows
[   1    7    8 ... 3251 3252 3257]
2200


In [6]:
# Pair every test-set id with its predicted label and index the resulting
# frame by id, ready to be written out as a submission.
pred = np.array(predictions)
ids = list(test_data['id'])
to_submit = pd.DataFrame({'id': ids, 'target': pred}).set_index('id')

# Show the raw predictions, how many there are, and the assembled frame.
for shown in (pred, len(pred), to_submit):
    print(shown)

[0 1 1 ... 1 1 1]
3263
       target
id           
0           0
2           1
3           1
9           1
11          1
...       ...
10861       1
10865       1
10868       1
10874       1
10875       1

[3263 rows x 1 columns]


In [7]:
# Write the submission file: the 'id' index and the 'target' column, with a
# header row.
# - The former self-assignment of the whole frame to its own 'target' column
#   and the re-wrapping in pd.DataFrame changed nothing and were dropped.
# - header='False' was a non-empty (truthy) string, so a header row was
#   written anyway; header=True states that behaviour explicitly.
to_submit['target'].to_csv('nlp.csv', header=True)