In [1]:
import numpy as np 
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string, re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Importing Data

In [2]:
tweets = pd.read_csv('Tweets1.csv')

In [3]:
tweets.head()


Unnamed: 0,label,tweet
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [4]:
tweets_filtered = tweets[tweets['label'] != 'neutral']


In [5]:
tweets_filtered

Unnamed: 0,label,tweet
1,positive,@VirginAmerica plus you've added commercials t...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
...,...,...
14633,negative,@AmericanAir my flight was Cancelled Flightled...
14634,negative,@AmericanAir right on cue with the delays👌
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...


# Data Preprocessing 

In [6]:
def clean_tweet(tweet):
    lemmatizer = WordNetLemmatizer() # lemmatizer return word in it's base noun (root)
    twt_tokenizer= TweetTokenizer(strip_handles=True) # tokenizer convert string into token
    tokens = twt_tokenizer.tokenize(tweet) 
    stops = stopwords.words('english')+ list(string.punctuation) # list of stopwords and punctuation
    no_stops_sentence = []
    for w in tokens: # removing stop words
        if w not in stops:
            no_stops_sentence.append(w)
    no_hashtag_sentence = []
    for w in no_stops_sentence: # removing # from words using regex
        no_hashtag_sentence.append(re.sub('#','',w))
    no_url_sentence=[]
    for w in no_hashtag_sentence: # removing www from urls using regex
        no_url_sentence.append(re.sub('www\S','',w))
    no_http_sentence=[]
    for w in no_url_sentence: # removing http from urls using regex
        no_http_sentence.append(re.sub('http','',w))
    lemma_sentence=[]
    for w in no_http_sentence: # lemmatizing all words to it's root
        lemma_sentence.append(lemmatizer.lemmatize(w))
    final_tokens=[]
    for w in lemma_sentence: # removing no length words
        if len(w)>1:
            final_tokens.append(w)
    final_sentence=""
    for w in final_tokens:
        final_sentence=final_sentence+' '+w
    return final_sentence


        
    

In [7]:
tweets_filtered['tweet']=tweets_filtered['tweet'].apply(clean_tweet)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_filtered['tweet']=tweets_filtered['tweet'].apply(clean_tweet)


In [8]:
y=tweets_filtered['label']

In [19]:
tweets_filtered

Unnamed: 0,label,tweet
1,positive,plus added commercial experience ... tacky
3,negative,really aggressive blast obnoxious entertainme...
4,negative,really big bad thing
5,negative,seriously would pay 30 flight seat playing re...
6,positive,yes nearly every time fly VX ear worm go away :)
...,...,...
14633,negative,flight Cancelled Flightled leaving tomorrow m...
14634,negative,right cue delay
14635,positive,thank got different flight Chicago
14636,negative,leaving 20 minute Late Flight No warning comm...


# Feature engineering

In [10]:
bow_vectorizer = CountVectorizer() #bag of words 
X_bow = bow_vectorizer.fit_transform(tweets_filtered['tweet']) # train and transform dataset into number

In [11]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(tweets_filtered['tweet'])

## train_test_split

In [12]:
X_bow_train, X_bow_test, y_bow_train, y_bow_test = train_test_split(
    X_bow, y, test_size=0.20, random_state=42)

In [13]:
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(
    X_tfidf, y, test_size=0.20, random_state=42)

# Modeling

### Random forest with bow

In [14]:
clf = RandomForestClassifier(random_state=99) 
clf.fit(X_bow_train, y_bow_train)
clf.score(X_bow_test, y_bow_test)

0.8973581637072325

In [34]:
y_pred_bow_proba=clf.predict_proba(X_bow_test)
y_pred_bow=[]
for x in y_pred_bow_proba:
    if(x[0]>0.55):
        y_pred_bow.append('negative')
    else:
        y_pred_bow.append('positive')

In [35]:
y_bow_test


11825    negative
8105     positive
1279     negative
14343    negative
3829     positive
           ...   
4927     positive
7527     negative
11445    negative
13388    positive
6482     negative
Name: label, Length: 2309, dtype: object

In [36]:
print(classification_report(y_bow_test, y_pred_bow))


              precision    recall  f1-score   support

    negative       0.93      0.94      0.93      1862
    positive       0.74      0.69      0.72       447

    accuracy                           0.89      2309
   macro avg       0.83      0.82      0.82      2309
weighted avg       0.89      0.89      0.89      2309



### Random forest with tfidf

In [16]:
clf_tfidf = RandomForestClassifier(random_state=99) #
clf_tfidf.fit(X_tfidf_train, y_tfidf_train) #training
clf_tfidf.score(X_tfidf_test, y_tfidf_test)

0.8960588999566912

In [17]:
y_pred_tfidf=clf_tfidf.predict(X_bow_test)
print(classification_report(y_bow_test, y_pred_tfidf))


              precision    recall  f1-score   support

    negative       0.94      0.89      0.91      1862
    positive       0.62      0.77      0.69       447

    accuracy                           0.87      2309
   macro avg       0.78      0.83      0.80      2309
weighted avg       0.88      0.87      0.87      2309



In [18]:
from joblib import dump, load #dump save
dump(clf, 'rnd.joblib') 
dump(bow_vectorizer, 'bow.joblib') 


['bow.joblib']