In [1]:
import pandas as pd

# Load the train and test datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Check the structure of the data
print(train_data.head())
print(test_data.head())


             tweet_id airline_sentiment    airline airline_sentiment_gold  \
0  567900433542488064          negative  Southwest                    NaN   
1  569989168903819264          positive  Southwest                    NaN   
2  568089179520954368          positive     United                    NaN   
3  568928195581513728          negative  Southwest                    NaN   
4  568594180014014464          negative     United                    NaN   

            name negativereason_gold  retweet_count  \
0  ColeyGirouard                 NaN              0   
1  WalterFaddoul                 NaN              0   
2      LocalKyle                 NaN              0   
3    amccarthy19                 NaN              0   
4        J_Okayy                 NaN              0   

                                                text tweet_coord  \
0  @SouthwestAir I am scheduled for the morning, ...         NaN   
1  @SouthwestAir seeing your workers time in and ...         NaN   
2 

In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Ensure NLTK components are downloaded
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize objects
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Text preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize
    filtered_words = [ps.stem(word) for word in tokens if word not in stop_words]  # Stemming and stopword removal
    return ' '.join(filtered_words)

# Apply preprocessing
train_data['cleaned_text'] = train_data['text'].apply(preprocess_text)
test_data['cleaned_text'] = test_data['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jatinbhatt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jatinbhatt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jatinbhatt/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vect = TfidfVectorizer(max_features=5000)  # Adjust max_features as necessary

# Fit and transform training data
X_train = tfidf_vect.fit_transform(train_data['cleaned_text'])

# Transform test data (do not fit)
X_test = tfidf_vect.transform(test_data['cleaned_text'])


In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Extract target variable
y_train = train_data['airline_sentiment']

# Split the data into training and validation sets (optional for cross-validation)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_split, y_train_split)

# Validate model
y_val_pred = nb_model.predict(X_val_split)
print(f"Validation Accuracy: {accuracy_score(y_val_split, y_val_pred)}")
print(classification_report(y_val_split, y_val_pred))


Validation Accuracy: 0.7076502732240437
              precision    recall  f1-score   support

    negative       0.69      0.99      0.82      1356
     neutral       0.74      0.21      0.32       458
    positive       0.89      0.30      0.45       382

    accuracy                           0.71      2196
   macro avg       0.77      0.50      0.53      2196
weighted avg       0.74      0.71      0.65      2196



In [5]:
# Make predictions on test data
y_test_pred = nb_model.predict(X_test)

# Save predictions to CSV without headers
pd.DataFrame(y_test_pred).to_csv('predictions.csv', index=False, header=False)
