In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
# Replace with your actual filenames
train_df = pd.read_csv("training_twitter_x_y_train.csv")
test_df = pd.read_csv("test_twitter_x_test.csv")

print(train_df.head())
print(train_df.columns)


             tweet_id airline_sentiment    airline airline_sentiment_gold  \
0  567900433542488064          negative  Southwest                    NaN   
1  569989168903819264          positive  Southwest                    NaN   
2  568089179520954368          positive     United                    NaN   
3  568928195581513728          negative  Southwest                    NaN   
4  568594180014014464          negative     United                    NaN   

            name negativereason_gold  retweet_count  \
0  ColeyGirouard                 NaN              0   
1  WalterFaddoul                 NaN              0   
2      LocalKyle                 NaN              0   
3    amccarthy19                 NaN              0   
4        J_Okayy                 NaN              0   

                                                text tweet_coord  \
0  @SouthwestAir I am scheduled for the morning, ...         NaN   
1  @SouthwestAir seeing your workers time in and ...         NaN   
2 

In [8]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return ADJ
    elif tag.startswith('V'):
        return VERB
    elif tag.startswith('N'):
        return NOUN
    elif tag.startswith('R'):
        return ADV
    else:
        return NOUN

In [None]:
stops = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_review(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.strip()
    
    words = text.split()
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    
    return ' '.join(output_words)

In [10]:
train_df['clean_text'] = train_df['text'].apply(clean_review)
test_df['clean_text'] = test_df['text'].apply(clean_review)

In [11]:
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['airline_sentiment'])
y = train_df['label']

In [12]:
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X = tfidf.fit_transform(train_df['clean_text'])
X_test = tfidf.transform(test_df['clean_text'])

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred, target_names=le.classes_))

Validation Accuracy: 0.7718579234972678
              precision    recall  f1-score   support

    negative       0.79      0.95      0.86      1356
     neutral       0.64      0.43      0.52       458
    positive       0.85      0.55      0.67       382

    accuracy                           0.77      2196
   macro avg       0.76      0.65      0.68      2196
weighted avg       0.77      0.77      0.75      2196



In [16]:
final_pred = model.predict(X_test)
predictions = le.inverse_transform(final_pred)

# Save submission file — no header, one column only
pd.DataFrame(predictions).to_csv("submission.csv", index=False, header=False)