In [None]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from helpers import preprocessing_pipeline, count_syntactic_features

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Read the raw training CSV, then hand it to the project's preprocessing helper.
raw_training_data = pd.read_csv(filepath_or_buffer="data/train.csv")
training_data = raw_training_data.pipe(preprocessing_pipeline)

In [None]:
# Names of the per-tweet feature columns that a later cell fills in:
# sentiment scores, syntactic counts, and the tweet length.
sentiment_columns = ["neg", "neu", "pos", "compound"]
syntactic_columns = ["Stopwords", "Nouns", "Verbs", "Adverbs", "Adjectives", "Pronouns"]
length_columns = ["length"]

# An empty frame that only carries the column names; joining it onto the
# training data appends those columns with no values set yet.
new_columns = pd.DataFrame(columns=sentiment_columns + syntactic_columns + length_columns)
training_data = training_data.join(new_columns)

In [None]:
# Per-tweet features (VADER sentiment scores, syntactic counts, tweet length)
# are cached on disk: reuse the cache when it exists, otherwise compute the
# features for every tweet and write the cache.
FEATURE_CACHE_PATH = "training_data.csv"

try:
    training_data = pd.read_csv(FEATURE_CACHE_PATH)
    # Caches written by earlier versions of this cell included the row index,
    # which pandas reads back as an "Unnamed: 0" column. Drop it so it does
    # not end up in the feature matrix built further down.
    training_data = training_data.drop(columns=["Unnamed: 0"], errors="ignore")

except FileNotFoundError:
    # Build the analyzer once rather than once per tweet.
    sentiment_analyzer = SentimentIntensityAnalyzer()

    # Collect one feature dict per tweet. Every step runs inside the loop so
    # that each row gets its own values (previously only the last row did).
    feature_rows = []
    for tweet in training_data["tweets"]:
        tweet_features = dict(sentiment_analyzer.polarity_scores(tweet))
        tweet_features.update(count_syntactic_features(tweet))
        tweet_features["length"] = len(tweet)
        feature_rows.append(tweet_features)

    # Attach the computed columns positionally (row order is the iteration
    # order above), overwriting any empty placeholder columns of the same name.
    feature_frame = pd.DataFrame(feature_rows)
    training_data = training_data.assign(
        **{name: feature_frame[name].to_numpy() for name in feature_frame.columns}
    )

    training_data = training_data.drop_duplicates().reset_index(drop=True)
    # index=False keeps the cache free of a spurious index column, so a cached
    # run and a fresh run produce the same set of columns.
    training_data.to_csv(FEATURE_CACHE_PATH, index=False)

# Must be the cell's last top-level expression for the notebook to display it.
training_data.head()

In [None]:
# One shared TweetTokenizer instance, reused for every tweet.
tweet_tokenizer = TweetTokenizer()


def tokenize(tweet):
    """Return the tokens that the shared TweetTokenizer produces for `tweet`."""
    tokens = tweet_tokenizer.tokenize(tweet)
    return tokens


# TF-IDF features over the "tweets" column, tokenized by `tokenize` above.
vectorizer = TfidfVectorizer(tokenizer=tokenize)
bag_of_words = vectorizer.fit_transform(raw_documents=training_data["tweets"])

In [None]:
# Target labels for the classifiers.
y = training_data["class"]

# Every column except the raw text and the label is used as a feature.
# drop() returns a new frame, so `training_data` itself is left untouched.
processed_training_data = training_data.drop(columns=["tweets", "class"])

# Read the feature names only after the frame exists; doing this first raised
# a NameError on a fresh kernel.
processed_columns = processed_training_data.columns

# The remaining columns must all be convertible to float32. Store them as a
# sparse matrix so they can be stacked next to the TF-IDF matrix.
sparse_training_data = csr_matrix(processed_training_data.to_numpy(dtype=np.float32))

In [None]:
# Full feature matrix: TF-IDF columns followed by the extra per-tweet feature columns.
X = hstack(blocks=[bag_of_words, sparse_training_data])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1337
)

# Logistic regression with default settings, scored on the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Linear SVM on the same train/test split.
# random_state is pinned (same value as the split seed) so that any randomness
# in the underlying solver does not change the report between runs.
model2 = LinearSVC(random_state=1337)
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Random forest on the same train/test split.
# Forest construction is randomized, so without a seed the report changes on
# every run. random_state is pinned (same value as the split seed) to make the
# result reproducible.
model3 = RandomForestClassifier(random_state=1337)
model3.fit(X_train, y_train)
y_pred = model3.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Disabled experiment: bigram count features. Re-enabling it requires importing
# CountVectorizer from sklearn.feature_extraction.text.
# bi_vectorizer = CountVectorizer(tokenizer=tokenize, 
#                                 ngram_range=(2, 2), 
#                                 max_features=50000)
# bigram = bi_vectorizer.fit_transform(training_data["tweets"])
# bigram_data = pd.DataFrame(bigram.toarray(), columns=bi_vectorizer.get_feature_names_out())
# bigram_data.head()

In [None]:
# X2 = pd.concat([X, bigram_data], axis=1)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X2, y, 
#                                                     test_size=0.3, 
#                                                     random_state=42)

# model = LogisticRegression(n_jobs=-1)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(classification_report(y_test, y_pred))

In [None]:
# Score the random forest (model3) on the held-out rows again and print the report.
y_pred = model3.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))