In [2]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from helpers import preprocessing_pipeline, count_syntactic_features


from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
# Read the raw training CSV, then hand it to the project's preprocessing helper
# (`preprocessing_pipeline`, imported from `helpers`).
TRAIN_CSV_PATH = "data/train.csv"
raw_training_data = pd.read_csv(TRAIN_CSV_PATH)
training_data = preprocessing_pipeline(raw_training_data)


In [4]:
# Reserve empty placeholder columns for the engineered features so that they
# appear in a fixed order: sentiment scores, part-of-speech counts, then length.
placeholder_names = [
    "neg", "neu", "pos", "compound",
    "Stopwords", "Nouns", "Verbs", "Adverbs", "Adjectives", "Pronouns",
    "length",
]
new_columns = pd.DataFrame(columns=placeholder_names)
# Joining a row-less frame adds the columns without touching existing rows.
training_data = training_data.join(new_columns)
training_data.head()


Unnamed: 0,tweets,class,neg,neu,pos,compound,Stopwords,Nouns,Verbs,Adverbs,Adjectives,Pronouns,length
21238,fav moment in sepp blatter vid ( 0:20 ) : `` w...,1,,,,,,,,,,,
21239,just found this while walking my human ....,1,,,,,,,,,,,
21240,'disrespected the wife of prophet ' - pseudo l...,1,,,,,,,,,,,
21241,do you know that super yeay satisfying feeling...,1,,,,,,,,,,,
21242,if you 're going to call someone ignorant and ...,1,,,,,,,,,,,


In [5]:
# Engineer per-tweet features: sentiment polarity scores, syntactic counts and
# character length, then drop duplicate rows and renumber the index.
#
# The analyzer is built once and reused for every tweet instead of being
# re-created on each iteration.
sentiment_analyzer = SentimentIntensityAnalyzer()

# Collect one feature record per tweet; a single DataFrame is built afterwards,
# which avoids cell-by-cell `.loc` writes.
feature_rows = []
for tweet in training_data["tweets"]:
    # Later updates win on key clashes, in this order: sentiment scores,
    # syntactic counts, then length.
    tweet_features = dict(sentiment_analyzer.polarity_scores(tweet))
    # `count_syntactic_features` comes from `helpers`; it is expected to return
    # a mapping of feature name -> count (confirm against the helper).
    tweet_features.update(count_syntactic_features(tweet))
    tweet_features["length"] = len(tweet)
    feature_rows.append(tweet_features)

feature_table = pd.DataFrame(feature_rows)

# Attach the columns positionally (`to_numpy()` bypasses index alignment), so
# the result is correct even if the current index contains repeated labels.
# Columns that already exist keep their position; new ones are appended.
training_data = (
    training_data
    .assign(**{name: feature_table[name].to_numpy() for name in feature_table.columns})
    .drop_duplicates()
    .reset_index(drop=True)
)
training_data.head()

Unnamed: 0,tweets,class,neg,neu,pos,compound,Stopwords,Nouns,Verbs,Adverbs,Adjectives,Pronouns,length
0,fav moment in sepp blatter vid ( 0:20 ) : `` w...,1,0.0,0.778,0.222,0.6908,10,5,1,1,2,0,116
1,just found this while walking my human ....,1,0.0,1.0,0.0,0.0,4,2,2,0,0,0,43
2,'disrespected the wife of prophet ' - pseudo l...,1,0.217,0.652,0.13,-0.296,3,6,2,0,0,0,80
3,do you know that super yeay satisfying feeling...,1,0.0,0.704,0.296,0.8126,11,3,6,1,1,0,120
4,if you 're going to call someone ignorant and ...,1,0.234,0.766,0.0,-0.6705,9,3,4,1,3,0,104


In [6]:
tweet_tokenizer = TweetTokenizer()


def tokenize(tweet):
    """Split one tweet into a list of tokens using the shared ``TweetTokenizer``.

    Args:
        tweet: The tweet text to tokenize.

    Returns:
        The tokens produced by ``tweet_tokenizer.tokenize``.
    """
    return tweet_tokenizer.tokenize(tweet)


# A custom tokenizer replaces the vectorizer's regex tokenization, so the
# default `token_pattern` would go unused; passing None makes that explicit and
# avoids scikit-learn's "token_pattern will not be used" warning.
vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
bag_of_words = vectorizer.fit_transform(training_data["tweets"])

# NOTE(review): `.toarray()` materialises the full documents x vocabulary
# matrix as a dense array; this may be memory-hungry on a large corpus.
bow_data = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())
bow_data.head()

Unnamed: 0,‍‍,!,#,$,%,',(,),*,+,...,󾭞,󾮖,󾮗,󾮙,󾮝,󾮞,󾮟,󾰀,󾰑,󾰴
0,0.0,0.0,0.0,0.0,0.0,0.151897,0.149883,0.140782,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.172338,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.096672,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Split the frame into the target labels and the engineered feature columns.
label_column = "class"
y = training_data[label_column]
# `drop` returns a new frame, so `training_data` itself is left untouched.
processed_training_data = training_data.drop(columns=["tweets", label_column])

In [8]:
# Build the design matrix: TF-IDF token columns followed by the engineered
# features, placed side by side and matched on row index.
feature_blocks = [bow_data, processed_training_data]
X = pd.concat(feature_blocks, axis="columns")
X.head()

Unnamed: 0,‍‍,!,#,$,%,',(,),*,+,...,neu,pos,compound,Stopwords,Nouns,Verbs,Adverbs,Adjectives,Pronouns,length
0,0.0,0.0,0.0,0.0,0.0,0.151897,0.149883,0.140782,0.0,0.0,...,0.778,0.222,0.6908,10,5,1,1,2,0,116
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,4,2,2,0,0,0,43
2,0.0,0.0,0.0,0.0,0.0,0.172338,0.0,0.0,0.0,0.0,...,0.652,0.13,-0.296,3,6,2,0,0,0,80
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.704,0.296,0.8126,11,3,6,1,1,0,120
4,0.0,0.0,0.0,0.0,0.0,0.096672,0.0,0.0,0.0,0.0,...,0.766,0.0,-0.6705,9,3,4,1,3,0,104


In [9]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
TEST_FRACTION = 0.3
SPLIT_SEED = 1337
split_parts = train_test_split(X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train, X_test, y_train, y_test = split_parts

# Fit a logistic regression with default settings (`fit` returns the estimator)
# and report per-class metrics on the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

: 

: 

In [None]:
# NOTE: disabled bigram experiment. Re-enabling it requires
# `from sklearn.feature_extraction.text import CountVectorizer`.
# bi_vectorizer = CountVectorizer(tokenizer=tokenize,
#                                 ngram_range=(2, 2),
#                                 max_features=50000)
# bigram = bi_vectorizer.fit_transform(training_data["tweets"])
# bigram_data = pd.DataFrame(bigram.toarray(), columns=bi_vectorizer.get_feature_names_out())
# bigram_data.head()

In [None]:
# X2 = pd.concat([X, bigram_data], axis=1)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X2, y, 
#                                                     test_size=0.3, 
#                                                     random_state=42)

# model = LogisticRegression(n_jobs=-1)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(classification_report(y_test, y_pred))