### Sentiment Analysis - Real or Fake Disaster Tweets

In [25]:
import pandas as pd
import numpy as np

train_data = pd.read_csv('data_set/train.csv')
test_data = pd.read_csv('data_set/test.csv')
# train_df = train_data[['id', 'keyword', 'text', 'target']]

train_df = train_data.iloc[0:, 0:5]
test_df = test_data.iloc[0:, 0:5]
print(len(test_df))

3263


In [26]:
real_disaster = train_df[train_df['target'] == '1']
real_disaster.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [27]:
fake_disaster = train_df[train_df['target'] == '0']
fake_disaster.head()

Unnamed: 0,id,keyword,location,text,target
15,23,,,What's up man?,0
16,24,,,I love fruits,0
17,25,,,Summer is lovely,0
18,26,,,My car is so fast,0
19,28,,,What a goooooooaaaaaal!!!!!!,0


#### Pre-Processor Method: Remove noise from the text

In [28]:
import re
from string import punctuation
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english')) 

def remove_noise_from_text(review):
    review = re.sub(r'[`=~!@#$%^&*()_+\[\]{};\\:"|<,./<>?^]', ' ', review)
    words = review.split()
    new_review = str()
    for word in words:
        if word in stop_words:
            continue;
        else:
            word = word.lower()
            word = word.strip(punctuation)
            word = word.strip()
            new_review += word + " "
    return new_review[:len(new_review)-1]
    

#### Remove noise from Real Disaster training 

In [29]:
for i in range(len(real_disaster)):
    str_data = str(real_disaster['text'].values[i])
    real_disaster['text'].values[i] = remove_noise_from_text(str_data)
real_disaster.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked shelter place notified off...,1
3,6,,,13 000 people receive wildfires evacuation ord...,1
4,7,,,just got sent photo ruby alaska smoke wildfire...,1


#### Remove noise from all fake disaster tweet

In [30]:
for i in range(len(fake_disaster)):
    str_data = str(fake_disaster['text'].values[i])
    fake_disaster['text'].values[i] = remove_noise_from_text(str_data)
fake_disaster.head()

Unnamed: 0,id,keyword,location,text,target
15,23,,,what's man,0
16,24,,,i love fruits,0
17,25,,,summer lovely,0
18,26,,,my car fast,0
19,28,,,what goooooooaaaaaal,0


### Preprocess Train Data

In [31]:
for i in range(len(train_df)):
    str_data = str(train_df['text'].values[i])
    train_df['text'].values[i] = remove_noise_from_text(str_data)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked shelter place notified off...,1
3,6,,,13 000 people receive wildfires evacuation ord...,1
4,7,,,just got sent photo ruby alaska smoke wildfire...,1


### Preprocess Test Data

In [32]:
for i in range(len(test_df)):
    str_data = str(test_df['text'].values[i])
    test_df['text'].values[i] = remove_noise_from_text(str_data)
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,just happened terrible car crash
1,2,,,heard earthquake different cities stay safe ev...
2,3,,,forest fire spot pond geese fleeing across str...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills 28 china taiwan


### Replacing the NaN value with space ' '

In [33]:
train_df.fillna('', inplace=True)
train_df.head()
test_df.fillna('', inplace=True)
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,just happened terrible car crash
1,2,,,heard earthquake different cities stay safe ev...
2,3,,,forest fire spot pond geese fleeing across str...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills 28 china taiwan


In [35]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


# train_df['keyword'] = train_df['keyword'].replace(0, np.nan, inplace= True)
# train_df['location'].replace(0, np.nan, inplace= True)
# train_df['text'].replace(0, np.nan, inplace= True)
# train_df['target'].replace(0, np.nan, inplace= True)


x = train_df['text']
y = train_df['target']

pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=20)),
])

pipeline.fit(x, y)
sample_submission = pd.read_csv("data_set/sample_submission.csv")
sample_submission["target"] = pipeline.predict(test_df['text'])
sample_submission.to_csv("submission.csv", index=False)