<h1 align="center"> Twitter US Airline Sentiment Classification (Multi Class Text Classification) </h1>

<img 
    src="https://storage.googleapis.com/kaggle-datasets-images/17/17/4c65377be972703be4141abbe260d3ac/dataset-cover.jpeg" 
    alt="" 
    width="600" 
    height="400"
    style="display: block; margin: 0 auto; border-radius:15px" 
/>


* [Twitter US Airline Sentiment Dataset](https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment)

---

In [1]:
import pandas as pd

import nltk
import string
import re
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data = pd.read_csv('/kaggle/input/twitter-airline-sentiment/Tweets.csv')

In [3]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
data.shape

(14640, 15)

In [5]:
data = data[["airline_sentiment", "text"]]

In [6]:
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [7]:
data.groupby('airline_sentiment').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
airline_sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
negative,9178,9087,@AmericanAir robocalls me with another Cancell...,2
neutral,3099,3067,@SouthwestAir sent,5
positive,2363,2298,@united thanks,5


In [8]:

def clean_text(text):
    ps = PorterStemmer()
    y = []
    
    text = text.lower()
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)
    text = nltk.word_tokenize(text)
    
    for i in text:
        if i not in stopwords.words('english'):
            y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)


In [9]:
data['text_cleaned'] = data['text'].apply(clean_text)

In [10]:
data.head()

Unnamed: 0,airline_sentiment,text,text_cleaned
0,neutral,@VirginAmerica What @dhepburn said.,@ virginamerica @ dhepburn said .
1,positive,@VirginAmerica plus you've added commercials t...,@ virginamerica plu 've ad commerci experi ......
2,neutral,@VirginAmerica I didn't today... Must mean I n...,@ virginamerica n't today ... must mean need t...
3,negative,@VirginAmerica it's really aggressive to blast...,@ virginamerica 's realli aggress blast obnoxi...
4,negative,@VirginAmerica and it's a really big bad thing...,@ virginamerica 's realli big bad thing


In [11]:
tf_vec = TfidfVectorizer(max_features=3000)
X = tf_vec.fit_transform(data['text_cleaned']).toarray()
X.shape

(14640, 3000)

In [12]:
y = data['airline_sentiment'].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [14]:
def train_models(models, X_train, X_test, y_train, y_test):

    for name, model in models.items():
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        score = accuracy_score(y_test, y_pred)

        print(f"{name} Accuracy Score: {score}")
        
        print(classification_report(y_test, y_pred))


In [15]:
models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(random_state=42)
}

train_models(models, X_train, X_test, y_train, y_test)


Multinomial Naive Bayes Accuracy Score: 0.7223360655737705
              precision    recall  f1-score   support

    negative       0.71      0.98      0.83      1791
     neutral       0.73      0.29      0.42       648
    positive       0.81      0.36      0.49       489

    accuracy                           0.72      2928
   macro avg       0.75      0.54      0.58      2928
weighted avg       0.73      0.72      0.68      2928

Random Forest Accuracy Score: 0.7476092896174863
              precision    recall  f1-score   support

    negative       0.77      0.93      0.85      1791
     neutral       0.63      0.41      0.50       648
    positive       0.74      0.51      0.60       489

    accuracy                           0.75      2928
   macro avg       0.71      0.62      0.65      2928
weighted avg       0.74      0.75      0.73      2928

