# Lab 07: Airline Tweets Sentiment Analysis

---
author: Yiran Hu
date: April 8, 2024
embed-resources: true
---

## Introduction

## Methods

In [310]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from joblib import dump

### Data

In [311]:
tweets = pd.read_csv("https://cs307.org/lab-07/data/tweets.csv")

In [312]:
tweets_train, tweets_test = train_test_split(
    tweets,
    test_size=0.25,
    random_state=42,
)

# create X and y for train data
X_train = tweets_train["text"]
y_train = tweets_train["sentiment"]

# create X and y for test data
X_test = tweets_test["text"]
y_test = tweets_test["sentiment"]

In [313]:
tweets_train

Unnamed: 0,sentiment,airline,text
2233,positive,Delta,@JetBlue Then en route to the airport the rebo...
10733,negative,United,@united now you've lost my bags too. At least...
400,neutral,US Airways,"@USAirways Hi, can you attach my AA FF# 94LXA6..."
7615,positive,United,"@United, will you fill it? Yes they will. Than..."
4099,negative,American,@AmericanAir thanks! I hope we get movies. Tv'...
...,...,...,...
5734,negative,United,@united Can i get a refund? I would like to bo...
5191,neutral,Virgin America,@VirginAmerica what is your policy on flying a...
5390,negative,United,@united I'm not sure how you can help. Your fl...
860,neutral,Virgin America,@VirginAmerica LAX to EWR - Middle seat on a r...


In [314]:
# find the 100 most common words in the train tweets
top_100_counter = CountVectorizer(max_features=10000)
X_top_100 = top_100_counter.fit_transform(X_train)
print("Top 100 Words:")
print(top_100_counter.get_feature_names_out())
print("")
X_top_100_dense = X_top_100.todense() # type: ignore
X_top_100_dense

Top 100 Words:
['00' '000' '000114' ... 'ztrdwv0n4l' 'zurich' 'zv2pt6trk9']



matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### Summary Statistics

In [316]:
# Counts and Proportions
airline_count=tweets_train.groupby('airline').agg('count').reset_index()
airline_count['proportion']=airline_count['text']/airline_count['text'].sum()
airline_count

Unnamed: 0,airline,sentiment,text,proportion
0,American,1551,1551,0.188342
1,Delta,1239,1239,0.150455
2,Southwest,1347,1347,0.16357
3,US Airways,1664,1664,0.202064
4,United,2166,2166,0.263024
5,Virgin America,268,268,0.032544


In [317]:
# word occurrence in each airline
word_counter = CountVectorizer()
word_counts = word_counter.fit_transform(X_train)
# find "cancelled", "please", "thanks"
for key, value in word_counter.vocabulary_.items():
    if key in ["cancelled", "please", "thanks"]:
        print(f"{key}: {word_counts[:, value].sum()}") # type: ignore

thanks: 587
please: 325
cancelled: 595


#### Visualization

In [318]:
# visualizations

### Models

In [334]:
pipeline = make_pipeline(
    CountVectorizer(max_features=10000, ngram_range=(1, 3)),
    MultinomialNB(alpha=2.41, fit_prior=False)
)
pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7941712204007286


In [335]:
dump(pipeline, "airline-sentiment.joblib")

['airline-sentiment.joblib']

## Results

In [325]:
# report model metrics

## Discussion

### Conclusion