In [1]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from helpers import preprocessing_pipeline, count_syntactic_features

In [2]:
# Read the raw training CSV, then run it through the project's preprocessing
# helper. The untouched frame stays available as `raw_train_set`.
TRAIN_CSV_PATH = "data/train.csv"
raw_train_set = pd.read_csv(TRAIN_CSV_PATH)
train_set = raw_train_set.pipe(preprocessing_pipeline)


In [3]:
# Append empty placeholder columns for the per-tweet features: four sentiment
# scores, six syntactic counts and the tweet length. Joining a frame that has
# columns but no rows leaves every new cell as NaN.
sentiment_names = ["neg", "neu", "pos", "compound"]
syntax_names = ["Stopwords", "Nouns", "Verbs", "Adverbs", "Adjectives", "Pronouns"]
new_columns = pd.DataFrame(columns=[*sentiment_names, *syntax_names, "length"])
train_set = train_set.join(new_columns, how="left")
train_set.head()


Unnamed: 0,tweets,class,neg,neu,pos,compound,Stopwords,Nouns,Verbs,Adverbs,Adjectives,Pronouns,length
21238,fav moment in sepp blatter vid ( 0:20 ) : `` w...,1,,,,,,,,,,,
21239,just found this while walking my human ....,1,,,,,,,,,,,
21240,'disrespected the wife of prophet ' - pseudo l...,1,,,,,,,,,,,
21241,do you know that super yeay satisfying feeling...,1,,,,,,,,,,,
21242,if you 're going to call someone ignorant and ...,1,,,,,,,,,,,


In [4]:
# Build the analyzer once. Constructing it inside the loop repeated the same
# setup work for every tweet even though the instance carries no per-tweet state.
sentiment_analyzer = SentimentIntensityAnalyzer()

# One record per tweet: sentiment scores, syntactic counts and character length.
# The text is read from the "tweets" column by name rather than by position
# (`row[0]`), which is deprecated for label-indexed Series in recent pandas.
feature_rows = [
    {
        **sentiment_analyzer.polarity_scores(tweet),
        **count_syntactic_features(tweet),
        "length": len(tweet),
    }
    for tweet in train_set["tweets"]
]

# Build the feature frame in a single step instead of writing cell by cell with
# `.loc` inside `iterrows`; this also gives the columns numeric dtypes instead
# of `object`.
tweet_features = pd.DataFrame(feature_rows, index=train_set.index)

# Assign by position (`to_numpy`) so every row receives its own features even
# if the index labels of `train_set` are not unique.
for column in tweet_features.columns:
    train_set[column] = tweet_features[column].to_numpy()

# Remove exact duplicate rows and renumber the index from 0.
train_set = train_set.drop_duplicates()
train_set = train_set.reset_index(drop=True)
train_set.head()

Unnamed: 0,tweets,class,neg,neu,pos,compound,Stopwords,Nouns,Verbs,Adverbs,Adjectives,Pronouns,length
0,fav moment in sepp blatter vid ( 0:20 ) : `` w...,1,0.0,0.778,0.222,0.6908,10,5,1,1,2,0,116
1,just found this while walking my human ....,1,0.0,1.0,0.0,0.0,4,2,2,0,0,0,43
2,'disrespected the wife of prophet ' - pseudo l...,1,0.217,0.652,0.13,-0.296,3,6,2,0,0,0,80
3,do you know that super yeay satisfying feeling...,1,0.0,0.704,0.296,0.8126,11,3,6,1,1,0,120
4,if you 're going to call someone ignorant and ...,1,0.234,0.766,0.0,-0.6705,9,3,4,1,3,0,104


In [7]:
tweet_tokenizer = TweetTokenizer()


def tokenize(tweet):
    """Split one tweet into tokens with the shared NLTK ``TweetTokenizer``."""
    return tweet_tokenizer.tokenize(tweet)


# `token_pattern=None` states explicitly that the custom tokenizer replaces the
# default regex; without it scikit-learn warns that `token_pattern` is unused.
vectorizer = CountVectorizer(tokenizer=tokenize, token_pattern=None)
bag_of_words = vectorizer.fit_transform(train_set["tweets"])

# Reuse the index of `train_set` so the bag-of-words rows stay aligned with the
# rows they were computed from when the two frames are combined by index later.
# NOTE: `toarray()` materialises a dense (n_tweets x vocabulary) matrix, which
# can be large for a big corpus.
bow_data = pd.DataFrame(
    bag_of_words.toarray(),
    index=train_set.index,
    columns=vectorizer.get_feature_names_out(),
)
bow_data.head()

Unnamed: 0,‍‍,!,#,$,%,',(,),*,+,...,󾭞,󾮖,󾮗,󾮙,󾮝,󾮞,󾮟,󾰀,󾰑,󾰴
0,0,0,0,0,0,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Keep the label as the target vector, then strip the raw text and the label
# from `train_set` so that only feature columns remain.
non_feature_columns = ["tweets", "class"]
y = train_set.loc[:, "class"]
train_set.drop(non_feature_columns, axis="columns", inplace=True)

In [26]:
# The bag-of-words vocabulary is learned from the tweets, so a token can share
# its label with a handcrafted feature column (for example "pos" or "length").
# Suffix the handcrafted column in that case so every label in X stays unique;
# when there is no clash the column names are unchanged.
shared_labels = bow_data.columns.intersection(train_set.columns)
handcrafted_features = train_set.rename(
    columns={label: f"{label}_feature" for label in shared_labels}
)

# `concat(axis=1)` aligns on the index and pads mismatches with NaN, so fail
# loudly if the two frames do not describe the same rows.
assert bow_data.index.equals(handcrafted_features.index), (
    "bow_data and train_set must share the same index before concatenation"
)

X = pd.concat([bow_data, handcrafted_features], axis=1)
X.head()

Unnamed: 0,‍‍,!,#,$,%,',(,),*,+,...,neu,pos,compound,Stopwords,Nouns,Verbs,Adverbs,Adjectives,Pronouns,length
0,0,0,0,0,0,2,1,1,0,0,...,0.778,0.222,0.6908,10,5,1,1,2,0,116
1,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,4,2,2,0,0,0,43
2,0,0,0,0,0,2,0,0,0,0,...,0.652,0.13,-0.296,3,6,2,0,0,0,80
3,0,0,0,0,0,0,0,0,0,0,...,0.704,0.296,0.8126,11,3,6,1,1,0,120
4,0,0,0,0,0,1,0,0,0,0,...,0.766,0.0,-0.6705,9,3,4,1,3,0,104
