In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [2]:
#loading data

data = pd.read_csv('train.csv', names=['id','keyword', 'location', 'text', 'target'], skiprows = 1)
test_data = pd.read_csv('test.csv', names=['id','keyword', 'location', 'text'], skiprows = 1)

In [3]:
#combining text from keyword, location and text 

columns = ['location', 'text']
data['text'] = data[columns].astype(str).sum(axis=1)
t_data_list = []

for n in data['text']:
    
    row = str(n)
    row = row.replace('nannan', '')
    row = row.replace('nan', '')
    t_data_list.append(row)

data['text'] = t_data_list
####################################################

test_data['text'] = data[columns].astype(str).sum(axis=1)
t_data_list = []

for n in test_data['text']:
    
    row = str(n)
    row = row.replace('nannan', '')
    row = row.replace('nan', '')
    t_data_list.append(row)

test_data['text'] = t_data_list

print("Training data \n",data.head())
print("Test data \n",test_data.head())
print("Length of Training data \n", len(data))
print("Length of Test data \n", len(test_data))
data.groupby('target').size() #0: fake 4342   1:real 3271
data.to_csv('t.csv', header = 'False')

Training data 
    id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
Test data 
    id keyword location                                               text
0   0     NaN      NaN  Our Deeds are the Reason of this #earthquake M...
1   2     NaN      NaN             Forest fire near La Ronge Sask. Canada
2   3     NaN      NaN  All residents asked to 'shelter in place' are ...
3   9     NaN      NaN  13,000 people receive #wildfires evacuation or...
4  11     NaN      NaN  Just got sent this photo from Ruby #Alaska 

In [4]:
data_y = data['target']
data_x = data['text']
test = test_data['text']
print(data_y.head())
print(data_x.head())

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64
0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object


In [5]:
cv = TfidfVectorizer(min_df = 1 , stop_words = 'english')
x_traincv = cv.fit_transform(data_x)
y_train = data_y
test = cv.transform(test)
print("Test: \n", test)
print("x_traincv: \n", x_traincv)

Test: 
   (0, 18473)	0.395377275408484
  (0, 8909)	0.5104343743408171
  (0, 7458)	0.3580364205039489
  (0, 6492)	0.5104343743408171
  (0, 2083)	0.44090841528131525
  (1, 19481)	0.5003786477180231
  (1, 19103)	0.5233318167130359
  (1, 15411)	0.32985608580793785
  (1, 12952)	0.3681742877434479
  (1, 8896)	0.3289355861741757
  (1, 4502)	0.35098298416310514
  (2, 19943)	0.5556646105909391
  (2, 18748)	0.2725747407974012
  (2, 17320)	0.45564089712137795
  (2, 16486)	0.25419137155108457
  (2, 16189)	0.26802043127358316
  (2, 15826)	0.315302298762353
  (2, 8183)	0.2443794975291982
  (2, 8060)	0.2098771713285334
  (2, 2627)	0.2640032455729709
  (3, 24174)	0.38126894440386544
  (3, 18494)	0.4469700312435817
  (3, 17053)	0.23746335889266923
  (3, 16486)	0.37686902375111725
  (3, 8060)	0.31116793691140093
  :	:
  (3260, 11447)	0.3644819106813736
  (3260, 9325)	0.3644819106813736
  (3260, 7849)	0.24944448077659792
  (3260, 7160)	0.2938391415011162
  (3261, 23833)	0.22253659430525854
  (3261, 22571

In [6]:
clf = MultinomialNB()
clf.fit(x_traincv, y_train)
predictions = clf.predict(test)
print(len(predictions))
print(predictions)
position_of_fakenews = np.where(predictions == 0)[0]
for i in range(0 , len(position_of_fakenews)):
    position_of_fakenews[i] += 1
print('fakenews rows')
print(position_of_fakenews)
print(len(position_of_fakenews))

3263
[1 1 1 ... 1 0 0]
fakenews rows
[   5   12   13 ... 3260 3262 3263]
2179


In [7]:
pred = np.array(predictions)

ids = list(test_data['id'])
print(pred)
print(len(pred))
to_submit = pd.DataFrame({'id': ids, 'target': pred})
to_submit.set_index('id', inplace = True)
print(to_submit)
#to_submit.index += 1

[1 1 1 ... 1 0 0]
3263
       target
id           
0           1
2           1
3           1
9           1
11          0
...       ...
10861       0
10865       0
10868       1
10874       0
10875       0

[3263 rows x 1 columns]


In [8]:
to_submit.target = to_submit#.target.astype(bool)
to_submit = pd.DataFrame(to_submit)
to_submit.target.to_csv('nlp_locationtext.csv', header = 'False')