In [10]:
import os, html, re, unicodedata
import pandas as pd
from openai import OpenAI
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#export OPENAI_API_KEY="sk-..."
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [12]:
#Load original Kaggle download
raw = pd.read_csv("Tweets.csv")

print("Shape:", raw.shape)
display(raw.head())

#Save for record
raw.to_csv("1_raw_tweets.csv", index=False)
print("1_raw_tweets.csv written.")


Shape: (14640, 15)


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,24/02/15 11:35,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,24/02/15 11:15,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,24/02/15 11:15,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,24/02/15 11:15,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,24/02/15 11:14,,Pacific Time (US & Canada)


1_raw_tweets.csv written.


In [13]:
#Unescape HTML entities
raw["unescaped"] = raw["text"].apply(html.unescape)

print("After unescape shape:", raw[["unescaped"]].shape)
display(raw[["text","unescaped"]].head())

# 3.2 Save
raw.to_csv("2_unescaped.csv", index=False)
print("2_unescaped.csv written.")


After unescape shape: (14640, 1)


Unnamed: 0,text,unescaped
0,@VirginAmerica What @dhepburn said.,@VirginAmerica What @dhepburn said.
1,@VirginAmerica plus you've added commercials t...,@VirginAmerica plus you've added commercials t...
2,@VirginAmerica I didn't today... Must mean I n...,@VirginAmerica I didn't today... Must mean I n...
3,@VirginAmerica it's really aggressive to blast...,@VirginAmerica it's really aggressive to blast...
4,@VirginAmerica and it's a really big bad thing...,@VirginAmerica and it's a really big bad thing...


2_unescaped.csv written.


In [14]:
#Replace arrows in the unescaped text
raw["routes"] = raw["unescaped"].str.replace(
    r'(\w+)\s*(?:->|>)\s*(\w+)', r'\1 to \2', regex=True
)

print("After routes shape:", raw[["routes"]].shape)
display(raw[["unescaped","routes"]].head())

# 4.2 Save
raw.to_csv("3_routes.csv", index=False)
print("3_routes.csv written.")


After routes shape: (14640, 1)


Unnamed: 0,unescaped,routes
0,@VirginAmerica What @dhepburn said.,@VirginAmerica What @dhepburn said.
1,@VirginAmerica plus you've added commercials t...,@VirginAmerica plus you've added commercials t...
2,@VirginAmerica I didn't today... Must mean I n...,@VirginAmerica I didn't today... Must mean I n...
3,@VirginAmerica it's really aggressive to blast...,@VirginAmerica it's really aggressive to blast...
4,@VirginAmerica and it's a really big bad thing...,@VirginAmerica and it's a really big bad thing...


3_routes.csv written.


In [15]:
#Strip only the '@', keep the handle text
raw["handles"] = raw["routes"].str.replace(r'@(\w+)', r'\1', regex=True)

print("After handles shape:", raw[["handles"]].shape)
display(raw[["routes","handles"]].head())

raw.to_csv("4_handles.csv", index=False)
print("4_handles.csv written.")


After handles shape: (14640, 1)


Unnamed: 0,routes,handles
0,@VirginAmerica What @dhepburn said.,VirginAmerica What dhepburn said.
1,@VirginAmerica plus you've added commercials t...,VirginAmerica plus you've added commercials to...
2,@VirginAmerica I didn't today... Must mean I n...,VirginAmerica I didn't today... Must mean I ne...
3,@VirginAmerica it's really aggressive to blast...,VirginAmerica it's really aggressive to blast ...
4,@VirginAmerica and it's a really big bad thing...,VirginAmerica and it's a really big bad thing ...


4_handles.csv written.


In [16]:
#Drop any http:// or https:// links
raw["no_urls"] = raw["handles"].str.replace(r"http[s]?://\S+", "", regex=True)

print("After URL removal shape:", raw[["no_urls"]].shape)
display(raw[["handles","no_urls"]].head())

raw.to_csv("5_nourls.csv", index=False)
print("5_nourls.csv written.")


After URL removal shape: (14640, 1)


Unnamed: 0,handles,no_urls
0,VirginAmerica What dhepburn said.,VirginAmerica What dhepburn said.
1,VirginAmerica plus you've added commercials to...,VirginAmerica plus you've added commercials to...
2,VirginAmerica I didn't today... Must mean I ne...,VirginAmerica I didn't today... Must mean I ne...
3,VirginAmerica it's really aggressive to blast ...,VirginAmerica it's really aggressive to blast ...
4,VirginAmerica and it's a really big bad thing ...,VirginAmerica and it's a really big bad thing ...


5_nourls.csv written.


In [17]:
#Lowercase + unicode normalize + collapse spaces
def finalize(s):
    s = s.lower()
    s = unicodedata.normalize("NFKC", s)
    s = s.encode("ascii","ignore").decode("ascii")
    return " ".join(s.split())

raw["prepped_text"] = raw["no_urls"].apply(finalize)

print("After lowercase/norm shape:", raw[["prepped_text"]].shape)
display(raw[["no_urls","prepped_text"]].head())

raw.to_csv("6_lower_norm.csv", index=False)
print("6_lower_norm.csv written.")


After lowercase/norm shape: (14640, 1)


Unnamed: 0,no_urls,prepped_text
0,VirginAmerica What dhepburn said.,virginamerica what dhepburn said.
1,VirginAmerica plus you've added commercials to...,virginamerica plus you've added commercials to...
2,VirginAmerica I didn't today... Must mean I ne...,virginamerica i didn't today... must mean i ne...
3,VirginAmerica it's really aggressive to blast ...,virginamerica it's really aggressive to blast ...
4,VirginAmerica and it's a really big bad thing ...,virginamerica and it's a really big bad thing ...


6_lower_norm.csv written.


In [18]:
#Drop duplicates on the final text
before = len(raw)
dedup = raw.drop_duplicates(subset=["prepped_text"], keep="first")
dropped = before - len(dedup)

print(f"Dropped {dropped} duplicates, new shape: {dedup.shape}")
display(dedup[["prepped_text","airline_sentiment"]].head())

dedup.to_csv("7_dedup.csv", index=False)
print("7_dedup.csv written.")


Dropped 287 duplicates, new shape: (14353, 20)


Unnamed: 0,prepped_text,airline_sentiment
0,virginamerica what dhepburn said.,neutral
1,virginamerica plus you've added commercials to...,positive
2,virginamerica i didn't today... must mean i ne...,neutral
3,virginamerica it's really aggressive to blast ...,negative
4,virginamerica and it's a really big bad thing ...,negative


7_dedup.csv written.


In [19]:
#Stratified 90/10 split
train_df, test_df = train_test_split(
    dedup, test_size=0.10, random_state=42,
    stratify=dedup["airline_sentiment"]
)

print("Train shape:", train_df.shape)
print("Test  shape:", test_df.shape)
display(test_df[["prepped_text","airline_sentiment"]].head())

train_df.to_csv("8_train.csv", index=False)
test_df .to_csv("9_test.csv",  index=False)
print("8_train.csv and 9_test.csv written.")


Train shape: (12917, 20)
Test  shape: (1436, 20)


Unnamed: 0,prepped_text,airline_sentiment
8949,"jetblue i have a internal bleed in my foot, an...",neutral
3774,"united just boarded ua1297, was refused from c...",negative
9386,usairways i find it funny that phlairport resp...,negative
4320,united i tried 2 dm it would not go thru... no...,negative
3121,really? 9+hours???? united: jenniferwalshpr we...,negative


8_train.csv and 9_test.csv written.


In [20]:
#Define your six-shot examples
examples = [
    ("i absolutely loved my flight with you today","positive"),
    ("smooth boarding and great service","positive"),
    ("the service was okay, nothing special","neutral"),
    ("it was an average experience overall","neutral"),
    ("my flight was cancelled and no one tells me why","negative"),
    ("delayed for hours with no update","negative"),
]

#Classify each test tweet
def classify(text):
    prompt = "Classify tweet sentiment as positive, neutral, or negative.\n\n"
    for ex, lbl in examples:
        prompt += f"Tweet: \"{ex}\" → {lbl}\n"
    prompt += f"\nTweet: \"{text}\" →"
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role":"user","content":prompt}]
    )
    return resp.choices[0].message.content.strip().lower()

test_df["predicted"] = test_df["prepped_text"].apply(classify)

#Evaluate
from sklearn.metrics import accuracy_score, classification_report
acc = accuracy_score(test_df["airline_sentiment"], test_df["predicted"])
print(f"Test accuracy: {acc:.2%}")
print(classification_report(test_df["airline_sentiment"], test_df["predicted"],
                            labels=["positive","neutral","negative"]))

#Save predictions
test_df[["tweet_id","prepped_text","airline_sentiment","predicted"]] \
    .to_csv("10_preds.csv", index=False)
print("10_preds.csv written."). 


Test accuracy: 80.57%
              precision    recall  f1-score   support

    positive       0.86      0.78      0.81       227
     neutral       0.53      0.86      0.66       301
    negative       0.97      0.79      0.87       908

    accuracy                           0.81      1436
   macro avg       0.79      0.81      0.78      1436
weighted avg       0.86      0.81      0.82      1436

10_preds.csv written.
