### Sentiment Analysis - Real or Fake Disaster Tweets

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

train_data = pd.read_csv('data_set/train.csv')
test_data = pd.read_csv('data_set/test.csv')
# train_df = train_data[['id', 'keyword', 'text', 'target']]

train_df = train_data.iloc[0:, 0:5]
test_df = test_data.iloc[0:, 0:5]
print(len(test_df))

3263


In [2]:
real_disaster = train_df[train_df['target'] == '1']
real_disaster.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
fake_disaster = train_df[train_df['target'] == '0']
fake_disaster.head()

Unnamed: 0,id,keyword,location,text,target
15,23,,,What's up man?,0
16,24,,,I love fruits,0
17,25,,,Summer is lovely,0
18,26,,,My car is so fast,0
19,28,,,What a goooooooaaaaaal!!!!!!,0


In [6]:
train_df.isnull().any()

id          False
keyword      True
location     True
text         True
target       True
dtype: bool

In [10]:
train_df[train_df['id', 'keyword', 'location', 'text', 'target']..isnull()]

Unnamed: 0,id,keyword,location,text,target
0,False,True,True,False,False
1,False,True,True,False,False
2,False,True,True,False,False
3,False,True,True,False,False
4,False,True,True,False,False
...,...,...,...,...,...
7608,False,True,True,False,False
7609,False,True,True,False,False
7610,False,True,True,False,False
7611,False,True,True,False,False


In [None]:
train_df[train_df['keyword'].isnull()]

#### Pre-Processor Method: Remove noise from the text

In [4]:
import re
from string import punctuation
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english')) 

def remove_noise_from_text(review):
    review = str(review)
    review = re.sub(r'[`=~!@#$%^&*()_+\[\]{};\\:"|<,./<>?^]', ' ', review)
    words = review.split()
    new_review = str()
    for word in words:
        if word in stop_words:
            continue;
        else:
            word = word.lower()
            word = word.strip(punctuation)
            word = word.strip()
            new_review += word + " "
    return new_review[:len(new_review)-1]
    

### Preprocess Train Data

In [5]:
train_df['clean_text'] = train_df['text'].apply(remove_noise_from_text)
train_df.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds reason earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked shelter place notified off...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13 000 people receive wildfires evacuation ord...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent photo ruby alaska smoke wildfire...


### Preprocess Test Data

In [6]:
test_df['clean_text'] = test_df['text'].apply(remove_noise_from_text)
test_df.head()

Unnamed: 0,id,keyword,location,text,clean_text
0,0,,,Just happened a terrible car crash,just happened terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",heard earthquake different cities stay safe ev...
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese fleeing across str...
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 china taiwan


### Replacing the NaN value with space ' '

In [7]:
train_df.fillna('', inplace=True)
train_df.head()
test_df.fillna('', inplace=True)
test_df.head()

Unnamed: 0,id,keyword,location,text,clean_text
0,0,,,Just happened a terrible car crash,just happened terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",heard earthquake different cities stay safe ev...
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese fleeing across str...
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 china taiwan


## Naive Bayes Classifier

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

x_train = train_df['clean_text']
target = train_df['target']
X_test = test_df['clean_text']

pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

pipeline.fit(x_train, target)
sample_submission = pd.read_csv("data_set/sample_submission.csv")
sample_submission["target"] = pipeline.predict(X_test)
sample_submission.to_csv("submission.csv", index=False)

## Logistic Regression

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer

x_train = train_df['clean_text']
target = train_df['target']
X_test = test_df['clean_text']

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

pipeline.fit(x_train, target)
sample_submission = pd.read_csv("data_set/sample_submission.csv")
sample_submission['target'] = pipeline.predict(X_test)
sample_submission.to_csv("submission_log.csv", index=False)

## K-Nearest Neighbor

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

x_train = train_df['clean_text']
target = train_df['target']
X_test = test_df['clean_text']

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors=20)),
])

pipeline.fit(x_train, target)
sample_submission = pd.read_csv("data_set/sample_submission.csv")
sample_submission["target"] = pipeline.predict(X_test)
sample_submission.to_csv("submission3.csv", index=False)