### Sentiment Analysis - Real or Fake Disaster Tweets

In [30]:
import pandas as pd
import numpy as np

train_data = pd.read_csv('data_set/train.csv')
test_data = pd.read_csv('data_set/test.csv')
# train_df = train_data[['id', 'keyword', 'text', 'target']]

train_df = train_data.iloc[0:, 0:5]
test_df = test_data.iloc[0:, 0:5]
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [31]:
real_disaster = train_df[train_df['target'] == '1']
real_disaster.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [32]:
fake_disaster = train_df[train_df['target'] == '0']
fake_disaster.head()

Unnamed: 0,id,keyword,location,text,target
15,23,,,What's up man?,0
16,24,,,I love fruits,0
17,25,,,Summer is lovely,0
18,26,,,My car is so fast,0
19,28,,,What a goooooooaaaaaal!!!!!!,0


#### Pre-Processor Method: Remove noise from the text

In [33]:
import re
from string import punctuation

def remove_noise_from_text(review):
    review = re.sub(r'[`=~!@#$%^&*()_+\[\]{};\\:"|<,./<>?^]', ' ', review)
    words = review.split()
    new_review = str()
    for word in words:
        word = word.lower()
        word = word.strip(punctuation)
        word = word.strip()
        new_review += word + " "
    return new_review[:len(new_review)-1]
    

#### Remove noise from Real Disaster training 

In [34]:
for i in range(len(real_disaster)):
    str_data = str(real_disaster['text'].values[i])
    real_disaster['text'].values[i] = remove_noise_from_text(str_data)
real_disaster.head()

#### Remove noise from all fake disaster tweet

In [35]:
for i in range(len(fake_disaster)):
    str_data = str(fake_disaster['text'].values[i])
    fake_disaster['text'].values[i] = remove_noise_from_text(str_data)
fake_disaster.head()

### Preprocess Train Data

In [36]:
for i in range(len(train_df)):
    str_data = str(train_df['text'].values[i])
    train_df['text'].values[i] = remove_noise_from_text(str_data)
train_df.head()

### Preprocess Test Data

In [64]:
for i in range(len(test_df)):
    str_data = str(test_df['text'].values[i])
    test_df['text'].values[i] = remove_noise_from_text(str_data)
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,heard about earthquake is different cities sta...
2,3,,,there is a forest fire at spot pond geese are ...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills 28 in china and taiwan


In [67]:
train_df.fillna('', inplace=True)
train_df.head(100)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
95,137,,Charlotte,9 Mile backup on I-77 South...accident blockin...,1
96,138,,"Baton Rouge, LA",Has an accident changed your life? We will hel...,0
97,139,,"Hagerstown, MD",#BREAKING: there was a deadly motorcycle car a...,1
98,141,,"Gloucestershire , UK",@flowri were you marinading it or was it an ac...,0


In [68]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


# train_df['keyword'] = train_df['keyword'].replace(0, np.nan, inplace= True)
# train_df['location'].replace(0, np.nan, inplace= True)
# train_df['text'].replace(0, np.nan, inplace= True)
# train_df['target'].replace(0, np.nan, inplace= True)

train_df.head()

y = train_df['target']
x = train_df['text']


pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

pipeline.fit(x, y)



Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)