## Twitter Sentiment Analysis with Random Forest

In [2]:
import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

import pandas as pd

# Read the tweet CSV, using its first column as the DataFrame index.
df = pd.read_csv('clean_tweet.csv', index_col=[0])
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,id,keyword,location,text,target,char_counts,word_counts,avg_wordlength,stopwords_counts,hashtag_counts,mentions_counts,digits_counts,uppercase_counts
0,1,,,our deeds are the reason of this earthquake ma...,1,57,13,4.384615,6,1,0,0,1
1,4,,,forest fire near la ronge sask canada,1,32,7,4.571429,0,0,0,1,0
2,5,,,all residents asked to shelter in place are be...,1,112,22,5.090909,9,0,0,1,0
3,6,,,130 people receive wildfires evacuation orders...,1,57,8,7.125,1,1,0,1,0
4,7,,,just got sent this photo from ruby alaska as s...,1,72,16,4.5,6,2,0,0,0


In [3]:
# Model input is the tweet text column; the label is the `target` column.
X, y = df['text'], df['target']

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [5]:
# Display the sizes of the training and test splits as the cell output.
X_train.shape,X_test.shape

((6090,), (1523,))

In [6]:
from wordcloud import STOPWORDS
# Copy the wordcloud stop words into our own set, then expose the same
# entries as a list for APIs that expect a list.
stopwords = set(STOPWORDS)
stopwords_list = [*stopwords]


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF features feeding a random forest; `stopwords_list` is the list built
# in the stop-word cell above, so it is not rebuilt here.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All rebuilds the same forest and reports the same metrics;
# without it the bootstrap samples and feature draws differ on every run.
clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords_list)),
    ('clf', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)),
])

# Fit the vectorizer and the classifier on the training split only.
clf.fit(X_train, y_train)

# Evaluate on the held-out split.
predictions = clf.predict(X_test)
print("accuracy score :", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))


accuracy score : 0.778069599474721
              precision    recall  f1-score   support

           0       0.78      0.85      0.81       874
           1       0.77      0.68      0.72       649

    accuracy                           0.78      1523
   macro avg       0.78      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523



In [8]:
import pickle
from pathlib import Path

# Persist the fitted pipeline so it can be reloaded without retraining.
model_path = Path('model/RF_twitter_sentiment.pkl')
# Create the target folder when missing so this cell also works on a fresh
# checkout instead of failing with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)
# The context manager flushes and closes the file even if pickling raises;
# passing a bare open(...) to pickle.dump left the handle open.
with model_path.open('wb') as model_file:
    pickle.dump(clf, model_file)

In [9]:
# Quick check: run the fitted pipeline on a single one-word document.
clf.predict(['earthquake'])

array([1], dtype=int64)

In [10]:
def predict_disaster(sentences, model):
    """Classify tweets, print each one with its label, and return the labels.

    Args:
        sentences: A single text or an iterable of texts. A bare string is
            treated as one tweet, not as a sequence of characters.
        model: Object exposing ``predict(texts)`` that yields one prediction
            per text; a prediction equal to 1 is reported as a disaster.

    Returns:
        list[str]: ``'Disaster Tweet'`` or ``'Normal Tweet'`` for every input,
        in input order. Empty when there is nothing to classify.
    """
    # Materialise the input once: a one-shot iterator would otherwise be
    # consumed by predict() and leave nothing to pair with the predictions.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    # Nothing to classify: skip the model call (scikit-learn estimators are
    # expected to reject a zero-sample batch).
    if not sentences:
        return []

    predictions = model.predict(sentences)
    labels = []
    for sentence, pred in zip(sentences, predictions):
        # Interpret the prediction
        prediction_label = 'Disaster Tweet' if pred == 1 else 'Normal Tweet'
        labels.append(prediction_label)

        # Print the sentence and its prediction
        print(f"Sentence: {sentence}")
        print(f"Prediction: {prediction_label}\n")

    # Returned so callers that store the result get the labels, not None.
    return labels


In [11]:
# Hand-written sample tweets for an eyeball check of the classifier.
sentences = [
    "An intense hurricane has made landfall, causing widespread flooding and destruction in coastal areas. Stay safe and evacuate if you can!",
    "Heard about #earthquake is different cities, stay safe everyone.",
    "weather is very good to play cricket",
    "@RosieGray Now in all sincerety do you think the UN would move to Israel if there was a fraction of a chance of being annihilated?",
]

# Prints every sentence followed by its predicted label.
result = predict_disaster(sentences, clf)


Sentence: An intense hurricane has made landfall, causing widespread flooding and destruction in coastal areas. Stay safe and evacuate if you can!
Prediction: Disaster Tweet

Sentence: Heard about #earthquake is different cities, stay safe everyone.
Prediction: Disaster Tweet

Sentence: weather is very good to play cricket
Prediction: Normal Tweet

Sentence: @RosieGray Now in all sincerety do you think the UN would move to Israel if there was a fraction of a chance of being annihilated?
Prediction: Normal Tweet

