In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('twitter_x_y_train.csv')

In [3]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
df.tail()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
10975,569934458364813313,neutral,American,,Cottopanama85,,0,@AmericanAir followback,,2015-02-23 10:58:58 -0800,"ohio,panama",
10976,568564006329434113,positive,United,,PaulBEsteves,,0,@united thanks for the help. Wish the phone re...,,2015-02-19 16:13:17 -0800,Brooklyn,Eastern Time (US & Canada)
10977,569643648910028801,negative,US Airways,,runfixsteve,,0,@usairways the. Worst. Ever. #dca #customerser...,,2015-02-22 15:43:24 -0800,"St. Augustine, Florida",
10978,568864981917110272,negative,US Airways,,CLChicosky,,0,@nrhodes85: look! Another apology. DO NOT FLY ...,,2015-02-20 12:09:15 -0800,,
10979,568929299350179840,negative,United,,JW_Blocker,,1,@united you are by far the worst airline. 4 pl...,,2015-02-20 16:24:49 -0800,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10980 entries, 0 to 10979
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   tweet_id                10980 non-null  int64 
 1   airline_sentiment       10980 non-null  object
 2   airline                 10980 non-null  object
 3   airline_sentiment_gold  31 non-null     object
 4   name                    10980 non-null  object
 5   negativereason_gold     24 non-null     object
 6   retweet_count           10980 non-null  int64 
 7   text                    10980 non-null  object
 8   tweet_coord             776 non-null    object
 9   tweet_created           10980 non-null  object
 10  tweet_location          7430 non-null   object
 11  user_timezone           7403 non-null   object
dtypes: int64(2), object(10)
memory usage: 1.0+ MB


In [6]:
df.isnull().sum()

tweet_id                      0
airline_sentiment             0
airline                       0
airline_sentiment_gold    10949
name                          0
negativereason_gold       10956
retweet_count                 0
text                          0
tweet_coord               10204
tweet_created                 0
tweet_location             3550
user_timezone              3577
dtype: int64

In [7]:
training_documents = df.text.values
training_categories = df.airline_sentiment.values

In [8]:
training_documents

array(['@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled',
       '@SouthwestAir seeing your workers time in and time out going above and beyond is why I love flying with you guys. Thank you!',
       '@united Flew ORD to Miami and back and  had great crew, service on both legs. THANKS',
       ..., '@usairways the. Worst. Ever. #dca #customerservice',
       '@nrhodes85: look! Another apology. DO NOT FLY @USAirways',
       '@united you are by far the worst airline. 4 plane delays on 1 round trip flight. How is that possible.'],
      dtype=object)

In [9]:
test = pd.read_csv('twitter_x_test.csv')
testing_documents = test.text.values
testing_documents[0]

"@AmericanAir In car gng to DFW. Pulled over 1hr ago - very icy roads. On-hold with AA since 1hr. Can't reach arpt for AA2450. Wat 2 do?"

In [10]:
from nltk.corpus import stopwords
import string

stops = set(stopwords.words('english'))
punctuations = string.punctuation
stops.update(punctuations)

stops = list(stops)

In [11]:
count_vec = TfidfVectorizer(max_features=5000, max_df = 0.8, min_df = 0.001,ngram_range=(1,3), analyzer='word', stop_words = stops)

x_train = count_vec.fit_transform(training_documents)
y_train = training_categories

x_test = count_vec.transform(testing_documents)

In [12]:
count_vec.get_feature_names_out()

array(['000', '10', '10 minutes', ..., 'york', 'yr', 'zero'], dtype=object)

In [13]:
svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)

In [14]:
svc.score(x_train, y_train)

0.9265027322404371

In [15]:
np.savetxt('predict.csv', y_pred, fmt='%s')

In [16]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly']
}

grid_search = GridSearchCV(svc, parameters, cv=5)
grid_search.fit(x_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
best_svc = grid_search.best_estimator_

y_pred_grid = best_svc.predict(x_test)

Best Parameters: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}


In [17]:
np.savetxt('predict_grid.csv', y_pred_grid, fmt='%s')

In [18]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(svc, x_train, y_train, cv=5)
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f}%")

Cross-Validation Accuracy: 0.77%
