In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv('0000000000002747_training_twitter_x_y_train.csv')
test_df = pd.read_csv('0000000000002747_test_twitter_x_test.csv')

In [3]:
train_df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
train_df.isnull().sum()

tweet_id                      0
airline_sentiment             0
airline                       0
airline_sentiment_gold    10949
name                          0
negativereason_gold       10956
retweet_count                 0
text                          0
tweet_coord               10204
tweet_created                 0
tweet_location             3550
user_timezone              3577
dtype: int64

In [5]:
train_df.columns

Index(['tweet_id', 'airline_sentiment', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')

In [6]:
test_df.columns

Index(['tweet_id', 'airline', 'airline_sentiment_gold', 'name',
       'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')

In [7]:
train_df.drop(columns=['tweet_id','airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],axis=1,inplace=True)

In [8]:
test_df.drop(columns=['tweet_id', 'airline', 'airline_sentiment_gold', 'name',
       'negativereason_gold', 'retweet_count','tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],axis=1,inplace=True)

In [9]:
train_df.head()

Unnamed: 0,airline_sentiment,text
0,negative,"@SouthwestAir I am scheduled for the morning, ..."
1,positive,@SouthwestAir seeing your workers time in and ...
2,positive,@united Flew ORD to Miami and back and had gr...
3,negative,@SouthwestAir @dultch97 that's horse radish 😤🐴
4,negative,@united so our flight into ORD was delayed bec...


In [10]:
test_df.head()

Unnamed: 0,text
0,@AmericanAir In car gng to DFW. Pulled over 1h...
1,"@AmericanAir after all, the plane didn’t land ..."
2,@SouthwestAir can't believe how many paying cu...
3,@USAirways I can legitimately say that I would...
4,@AmericanAir still no response from AA. great ...


In [11]:
import re
RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
def remove_emoji(text):
    text = RE_EMOJI.sub(r'', text)
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    return text

In [12]:
train_df['text'] = train_df['text'].apply(remove_emoji)
test_df['text'] = test_df['text'].apply(remove_emoji)

In [13]:
train_df.head()

Unnamed: 0,airline_sentiment,text
0,negative,"@SouthwestAir I am scheduled for the morning, ..."
1,positive,@SouthwestAir seeing your workers time in and ...
2,positive,@united Flew ORD to Miami and back and had gr...
3,negative,@SouthwestAir @dultch97 that's horse radish
4,negative,@united so our flight into ORD was delayed bec...


In [14]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, stem_text
# Preprocessing functions
def preprocess_text(text):
    # Gensim preprocessing
    filters = [
        lambda x: x.lower(),           # Convert to lowercase
        strip_tags,                    # Remove HTML tags
        strip_numeric,                 # Remove numbers
        strip_punctuation,             # Remove punctuation
        strip_multiple_whitespaces     # Remove extra whitespaces     
    ]
    text = ' '.join(preprocess_string(text, filters=filters))
    return text

In [15]:
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

In [16]:
train_df.head()

Unnamed: 0,airline_sentiment,text
0,negative,southwestair i am scheduled for the morning da...
1,positive,southwestair seeing your workers time in and t...
2,positive,united flew ord to miami and back and had grea...
3,negative,southwestair dultch that s horse radish
4,negative,united so our flight into ord was delayed beca...


In [17]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [18]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

nlp = English()

# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)

In [19]:
def tokenize_stemming(text):
    tokens = tokenizer(text)

    # Convert the tokens to a list for better readability
    token_list = [token.text for token in tokens]
    stem_list = []
    for word in token_list:
        stem_list.append(ps.stem(word))
    return " ".join(stem_list)

In [20]:
test_df['text'] = test_df['text'].apply(tokenize_stemming)
train_df['text'] = train_df['text'].apply(tokenize_stemming)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english',max_features=3000).fit(train_df['text'])

In [22]:
feature_names = cv.get_feature_names_out()

In [23]:
print(feature_names[:20])

['aa' 'aadvantag' 'abandon' 'abc' 'abil' 'abl' 'aboard' 'abov' 'abq'
 'absolut' 'absurd' 'abt' 'abus' 'abysm' 'ac' 'accept' 'access' 'accid'
 'accident' 'accommod']


In [24]:
x_train = cv.fit_transform(train_df['text']).toarray()
y_train = train_df['airline_sentiment']

In [25]:
from sklearn.naive_bayes import MultinomialNB
mb = MultinomialNB()

In [26]:
mb.fit(x_train,y_train)

In [27]:
x_test = cv.transform(test_df['text']).toarray()

In [28]:
y_pred = mb.predict(x_test)

In [34]:
df = pd.DataFrame(y_pred)

In [36]:
df.to_csv("Y_predict.csv",header=False,index=False)