# Step 1 - Load The Dataset

In [2]:
import pandas as pd

# Read the raw tweet export and keep just the two columns the rest of the
# notebook uses: the sentiment label and the tweet text.
wanted_columns = ["airline_sentiment", "text"]
df = pd.read_csv("Tweets.csv")[wanted_columns]
df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


# Step 2 - Preprocess Text

In [3]:
import nltk
import string
import re
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK resources needed below: the stop-word corpus and the
# 'punkt' tokenizer data.
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name)

from nltk.corpus import stopwords

# Single Porter stemmer instance bound to `ps`.
ps = PorterStemmer()

_ENGLISH_STOPWORDS = None  # populated on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    The text is lower-cased, URLs are stripped, the remainder is tokenized
    with ``nltk.word_tokenize``, English stop words are dropped and every
    surviving token is Porter-stemmed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces. Punctuation tokens
        produced by the tokenizer are kept.
    """
    text = text.lower()
    # Drop each URL together with at most one trailing whitespace character.
    # NOTE: `.?` is a wildcard, so this matches "http://", "https://" and any
    # other single character in that position.
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)

    # Look the stop words up in a set that is built once, instead of calling
    # stopwords.words('english') and scanning the resulting list per token.
    stop_words = _english_stopwords()
    stems = [
        ps.stem(token)
        for token in nltk.word_tokenize(text)
        if token not in stop_words
    ]
    return " ".join(stems)

# Run every tweet through clean_text and store the result alongside the
# original text in a new 'text_cleaned' column.
raw_text = df['text']
df['text_cleaned'] = raw_text.map(clean_text)

# Preview the frame with the new column.
df.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,airline_sentiment,text,text_cleaned
0,neutral,@VirginAmerica What @dhepburn said.,@ virginamerica @ dhepburn said .
1,positive,@VirginAmerica plus you've added commercials t...,@ virginamerica plu 've ad commerci experi ......
2,neutral,@VirginAmerica I didn't today... Must mean I n...,@ virginamerica n't today ... must mean need t...
3,negative,@VirginAmerica it's really aggressive to blast...,@ virginamerica 's realli aggress blast obnoxi...
4,negative,@VirginAmerica and it's a really big bad thing...,@ virginamerica 's realli big bad thing


# Step 3 - Feature Extraction

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer limited to 3000 features.
vectorizer = TfidfVectorizer(max_features=3000)

# Learn the vocabulary from the cleaned tweets and turn them into a dense
# feature matrix.
# NOTE(review): the vectorizer is fit on every row here; if a held-out split
# is taken from X afterwards, the IDF weights will have seen those rows too.
tfidf_matrix = vectorizer.fit_transform(df['text_cleaned'])
X = tfidf_matrix.toarray()

# Sentiment labels as an array, in the same row order as X.
Y = df['airline_sentiment'].values

# Report the dimensions of features and labels.
print(X.shape, Y.shape)


(14640, 3000) (14640,)


# Step 4 - Train Model

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# One seed shared by the train/test split and the random forest, so that a
# fresh Restart & Run All in the same environment yields the same accuracies.
SEED = 2

# Split the dataset into training and testing sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=SEED)

# Train a multinomial Naive Bayes classifier and score it on the held-out set
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)

# Train a Random Forest classifier. random_state is pinned because the forest
# is randomized; without it the reported accuracy changes from run to run.
rf_model = RandomForestClassifier(random_state=SEED)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

# Print the accuracy of both models
print(f"Naive Bayes Accuracy: {nb_accuracy}")
print(f"Random Forest Accuracy: {rf_accuracy}")


Naive Bayes Accuracy: 0.7219945355191257
Random Forest Accuracy: 0.7523907103825137


In [8]:
# Spot-check clean_text on a single raw tweet and print the cleaned string.
tweet = "@VirginAmerica What @dhepburn said."
cleaned_tweet = clean_text(tweet)
print(cleaned_tweet)


@ virginamerica @ dhepburn said .


In [9]:
# Class balance: number of rows for each value of 'airline_sentiment'.
rows_by_sentiment = df.groupby('airline_sentiment')
sentiment_counts = rows_by_sentiment.size()
print(sentiment_counts)


airline_sentiment
negative    9178
neutral     3099
positive    2363
dtype: int64


In [11]:
# Boolean mask selecting the rows labelled 'neutral'.
is_neutral = df['airline_sentiment'] == 'neutral'
neutral_tweets = df[is_neutral]

# How many distinct tweet texts appear among the neutral rows.
neutral_text = neutral_tweets['text']
unique_neutral_tweets_count = neutral_text.nunique()
print(unique_neutral_tweets_count)

3067
