In [80]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#text
import re
import string
import nltk
from nltk.corpus import stopwords

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier

# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

In [81]:
# Read the training and test CSVs, then keep untouched copies of both frames
# under raw_train / raw_test.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
raw_train, raw_test = train.copy(), test.copy()

In [82]:
# Stack train on top of test with a fresh 0..N-1 index so both sets share the
# same text preprocessing; the first len(train) rows are the training rows.
all_data = pd.concat(objs=[train, test], axis=0, ignore_index=True)

# Text Preprocessing

## Cleaning Text

In [83]:
def cleaning_text(text):
    """Normalise a raw text string before tokenisation.

    The following steps are applied in order:

    1. lowercase the text;
    2. remove square-bracketed spans such as ``[link]``;
    3. remove URLs starting with ``http://``, ``https://`` or ``www.``;
    4. remove angle-bracketed tags such as ``<b>``;
    5. remove every ASCII punctuation character;
    6. remove newline characters;
    7. remove every word that contains a digit.

    Each removal substitutes the empty string, so surrounding whitespace is
    left untouched and may leave leading, trailing or doubled spaces.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # Patterns are raw strings: the non-raw forms relied on invalid escape
    # sequences ('\[', '\S', '\w', '\d'), which Python reports with a
    # DeprecationWarning / SyntaxWarning.
    text = text.lower()

    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # words separated only by a newline are glued together ("a\nb" -> "ab").
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [84]:
# Preview the text column before any cleaning is applied.
all_data['text']

0        Our Deeds are the Reason of this #earthquake M...
1                   Forest fire near La Ronge Sask. Canada
2        All residents asked to 'shelter in place' are ...
3        13,000 people receive #wildfires evacuation or...
4        Just got sent this photo from Ruby #Alaska as ...
                               ...                        
10871    EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
10872    Storm in RI worse than last hurricane. My city...
10873    Green Line derailment in Chicago http://t.co/U...
10874    MEG issues Hazardous Weather Outlook (HWO) htt...
10875    #CityofCalgary has activated its Municipal Eme...
Name: text, Length: 10876, dtype: object

In [85]:
# Run cleaning_text over every row, overwrite the text column with the result,
# and display the cleaned column.
all_data['text'] = all_data['text'].map(cleaning_text)
all_data['text']

0        our deeds are the reason of this earthquake ma...
1                    forest fire near la ronge sask canada
2        all residents asked to shelter in place are be...
3         people receive wildfires evacuation orders in...
4        just got sent this photo from ruby alaska as s...
                               ...                        
10871    earthquake safety los angeles ûò safety faste...
10872    storm in ri worse than last hurricane my  hard...
10873                    green line derailment in chicago 
10874            meg issues hazardous weather outlook hwo 
10875    cityofcalgary has activated its municipal emer...
Name: text, Length: 10876, dtype: object

In [86]:
from wordcloud import WordCloud

In [87]:
# Disabled word-cloud comparison of the target == 1 and target == 0 rows.
# The plotting code is wrapped in a string literal, so nothing is drawn:
# the cell only evaluates to (and echoes) the string itself.
'''fig, ax = plt.subplots(1,2,figsize=(15,10))
wordcloud1 = WordCloud().generate( ' '.join(all_data[all_data['target']==1]['text']) )
ax[0].imshow(wordcloud1)
ax[0].axis('off')
ax[0].set_title('Real',fontsize=40)

wordcloud2 = WordCloud().generate( ' '.join(all_data[all_data['target']==0]['text']) )
ax[1].imshow(wordcloud2)
ax[1].axis('off')
ax[1].set_title('Fake',fontsize=40)'''

"fig, ax = plt.subplots(1,2,figsize=(15,10))\nwordcloud1 = WordCloud().generate( ' '.join(all_data[all_data['target']==1]['text']) )\nax[0].imshow(wordcloud1)\nax[0].axis('off')\nax[0].set_title('Real',fontsize=40)\n\nwordcloud2 = WordCloud().generate( ' '.join(all_data[all_data['target']==0]['text']) )\nax[1].imshow(wordcloud2)\nax[1].axis('off')\nax[1].set_title('Fake',fontsize=40)"

## Tokenization

In [88]:
# Tokenizer that extracts runs of word characters. The pattern is a raw string
# because '\w' in a normal string literal is an invalid escape sequence.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [89]:
# Preview the cleaned text column before it is tokenized.
all_data['text']

0        our deeds are the reason of this earthquake ma...
1                    forest fire near la ronge sask canada
2        all residents asked to shelter in place are be...
3         people receive wildfires evacuation orders in...
4        just got sent this photo from ruby alaska as s...
                               ...                        
10871    earthquake safety los angeles ûò safety faste...
10872    storm in ri worse than last hurricane my  hard...
10873                    green line derailment in chicago 
10874            meg issues hazardous weather outlook hwo 
10875    cityofcalgary has activated its municipal emer...
Name: text, Length: 10876, dtype: object

In [90]:
# Replace each cleaned string with its list of tokens, then display the column.
all_data['text'] = all_data['text'].map(tokenizer.tokenize)
all_data['text']

0        [our, deeds, are, the, reason, of, this, earth...
1            [forest, fire, near, la, ronge, sask, canada]
2        [all, residents, asked, to, shelter, in, place...
3        [people, receive, wildfires, evacuation, order...
4        [just, got, sent, this, photo, from, ruby, ala...
                               ...                        
10871    [earthquake, safety, los, angeles, ûò, safety,...
10872    [storm, in, ri, worse, than, last, hurricane, ...
10873               [green, line, derailment, in, chicago]
10874      [meg, issues, hazardous, weather, outlook, hwo]
10875    [cityofcalgary, has, activated, its, municipal...
Name: text, Length: 10876, dtype: object

## Remove Stopwords

In [91]:
# Lazily populated cache of the NLTK English stopwords (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(word_list, stop_words=None):
    """Return the tokens of ``word_list`` that are not stopwords, in order.

    Previously ``stopwords.words('english')`` was re-evaluated for every
    single token and searched as a list; the stopwords are now loaded once
    and looked up in a set.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter.
    stop_words : container of str, optional
        Words to drop. Defaults to the NLTK English stopword list.

    Returns
    -------
    list of str
        The tokens that are not in ``stop_words``.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    return [word for word in word_list if word not in stop_words]

In [92]:
# Filter the stopwords out of every token list, then display the column.
all_data['text'] = all_data['text'].map(remove_stopwords)
all_data['text']

0        [deeds, reason, earthquake, may, allah, forgiv...
1            [forest, fire, near, la, ronge, sask, canada]
2        [residents, asked, shelter, place, notified, o...
3        [people, receive, wildfires, evacuation, order...
4        [got, sent, photo, ruby, alaska, smoke, wildfi...
                               ...                        
10871    [earthquake, safety, los, angeles, ûò, safety,...
10872    [storm, ri, worse, last, hurricane, hardest, h...
10873                   [green, line, derailment, chicago]
10874      [meg, issues, hazardous, weather, outlook, hwo]
10875    [cityofcalgary, activated, municipal, emergenc...
Name: text, Length: 10876, dtype: object

In [93]:
def combine_list(word_list):
    """Join string tokens into a single string separated by single spaces."""
    return str.join(' ', word_list)

In [94]:
# Turn each token list back into one space-separated string (the input format
# the vectorizers below are given), then display the column.
all_data['text'] = all_data['text'].map(combine_list)
all_data['text']

0             deeds reason earthquake may allah forgive us
1                    forest fire near la ronge sask canada
2        residents asked shelter place notified officer...
3        people receive wildfires evacuation orders cal...
4        got sent photo ruby alaska smoke wildfires pou...
                               ...                        
10871    earthquake safety los angeles ûò safety fasten...
10872    storm ri worse last hurricane hardest hit yard...
10873                        green line derailment chicago
10874             meg issues hazardous weather outlook hwo
10875    cityofcalgary activated municipal emergency pl...
Name: text, Length: 10876, dtype: object

# Transform Tokens to Vectors

In [136]:
# Term-count features. The vocabulary is fitted on the training rows only
# (the first len(train) rows of all_data); the remaining rows are the test set
# and are encoded with that same vocabulary.
count_vectorizer = CountVectorizer()
train_vector = count_vectorizer.fit_transform(all_data['text'].iloc[:len(train)])
test_vector = count_vectorizer.transform(all_data['text'].iloc[len(train):])

In [137]:
# Row/column counts of the original train and test frames.
train.shape, test.shape

((7613, 5), (3263, 4))

In [138]:
# Sanity check: row counts should match the frames above, and both matrices
# should have the same number of columns (the fitted vocabulary size).
train_vector.shape, test_vector.shape

((7613, 16412), (3263, 16412))

In [139]:
# TF-IDF features over unigrams and bigrams, keeping only terms that occur in
# at least 2 documents and in at most half of them. Fitted on the training
# rows only; the test rows are encoded with the same vocabulary.
tf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.5)
train_tf_vector = tf_vectorizer.fit_transform(all_data['text'].iloc[:len(train)])
test_tf_vector = tf_vectorizer.transform(all_data['text'].iloc[len(train):])

In [140]:
# Shape of the TF-IDF training matrix. Read it from the sparse matrix directly:
# calling .todense() first would allocate the full dense array just to
# report the same (rows, columns) tuple.
train_tf_vector.shape

(7613, 11077)

# Text Classification Model

In [123]:
# Logistic regression on the term-count features: mean F1 over 5 CV folds.
model1 = LogisticRegression()
model_selection.cross_val_score(
    estimator=model1,
    X=train_vector,
    y=train['target'],
    scoring='f1',
    cv=5,
).mean()

0.5834476966398702

In [124]:
# Logistic regression on the TF-IDF features: mean F1 over 5 CV folds.
model1_tf = LogisticRegression()
model_selection.cross_val_score(
    estimator=model1_tf,
    X=train_tf_vector,
    y=train['target'],
    scoring='f1',
    cv=5,
).mean()

0.5451346913743611

In [125]:
# Multinomial Naive Bayes on the term-count features: mean F1 over 5 CV folds.
model2 = MultinomialNB()
model_selection.cross_val_score(
    estimator=model2,
    X=train_vector,
    y=train['target'],
    scoring='f1',
    cv=5,
).mean()

0.6584930948850116

In [126]:
# Multinomial Naive Bayes on the TF-IDF features: mean F1 over 5 CV folds.
model2_tf = MultinomialNB()
model_selection.cross_val_score(
    estimator=model2_tf,
    X=train_tf_vector,
    y=train['target'],
    scoring='f1',
    cv=5,
).mean()

0.6187711183101462

In [128]:
# XGBoost classifier (default settings) on the term-count features:
# mean F1 over 5 CV folds.
model3 = XGBClassifier()
model_selection.cross_val_score(
    estimator=model3,
    X=train_vector,
    y=train['target'],
    scoring='f1',
    cv=5,
).mean()





0.4526923263527385

In [129]:
# XGBoost classifier (default settings) on the TF-IDF features:
# mean F1 over 5 CV folds.
model3_tf = XGBClassifier()
model_selection.cross_val_score(
    estimator=model3_tf,
    X=train_tf_vector,
    y=train['target'],
    scoring='f1',
    cv=5,
).mean()





0.44044956843219063

In [142]:
# Fit the count-vector Naive Bayes model (highest cross-validated F1 among the
# runs above) on all training rows, then write its test-set predictions into
# the sample submission template.
model2.fit(X=train_vector, y=train['target'])

submission = pd.read_csv('sample_submission.csv')
submission['target'] = model2.predict(test_vector)
submission.to_csv('submission.csv', index=False)