In [1]:


import numpy as np
import pandas as pd
import sklearn



Load data

In [2]:
def load_tweets(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Every line of the file becomes one list entry. Only trailing newline
    characters are removed; other whitespace and empty lines are kept.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The lines of the file, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        return [line.rstrip('\n') for line in source]
    

In [3]:
    # Read the three preprocessed tweet files in order: positive training
    # tweets, negative training tweets, then the unlabeled test tweets.
    train_pos_tweets, train_neg_tweets, test_tweets = (
        load_tweets(tweet_file)
        for tweet_file in (
            'processed_pos_tweets_non_transformer.txt',
            'processed_neg_tweets_non_transformer.txt',
            'processed_test_tweets_non_transformer.txt',
        )
    )
    print("Tweets loaded")

Tweets loaded


Convert to DataFrame

In [4]:
    from sklearn.model_selection import train_test_split
    from sklearn.utils import shuffle

    # Seed used for the stratified split below (and by the model cells).
    seed = 12222

    # Label convention: 1 = positive tweet, 0 = negative tweet.
    train_pos_labels = [1 for _ in train_pos_tweets]
    train_neg_labels = [0 for _ in train_neg_tweets]

    # Positives first, then negatives, so tweets and labels stay aligned.
    train_tweets = [*train_pos_tweets, *train_neg_tweets]
    train_labels = [*train_pos_labels, *train_neg_labels]

    # Shuffle tweets and labels together (this shuffle uses its own fixed
    # random_state of 10, independent of `seed`).
    train_tweets, train_labels = shuffle(train_tweets, train_labels, random_state=10)
    data = pd.DataFrame({'tweet': train_tweets, 'label': train_labels})

    X = data["tweet"].tolist()
    y = data["label"].tolist()

    # Hold out 5% of the labelled tweets for validation, keeping the class
    # proportions equal in both parts.
    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=0.05,
        stratify=y,
        random_state=seed,
    )

    # The test tweets are wrapped in a single-column frame; the TF-IDF cell
    # reads them back from the 'tweet' column.
    X_test = pd.DataFrame({'tweet': test_tweets})
    


TF-IDF

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: this cell replaces the raw-text X_train / X_val / X_test with their
# sparse feature matrices under the same names, so it can only be executed
# once after the split cell. Re-run the split cell before running it again.
X_test = list(X_test['tweet'])

vectorizer = TfidfVectorizer()
# with_mean=False disables centering: the mean is NOT subtracted, features are
# only divided by their standard deviation. Centering is not supported for
# sparse input because it would require a dense matrix.
scaler = StandardScaler(with_mean=False)

# Learn the vocabulary and IDF weights on the training tweets only, and
# vectorize them in the same pass. fit_transform gives the same result as
# fit followed by transform, but tokenizes the training corpus once instead
# of twice.
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

# Scaling statistics are likewise estimated on the training split only and
# then applied unchanged to the validation and test matrices.
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)


Check dimensionality of transformed representation

In [6]:
# Report (n_samples, n_features) for each transformed split.
for caption, features in (
    ('X_train dimensions', X_train),
    ('X_val dimensions', X_val),
    ('X_test dimensions', X_test),
):
    print(caption, features.shape)

X_train dimensions (2153743, 302425)
X_val dimensions (113355, 302425)
X_test dimensions (10000, 302425)


Linear models

In [43]:

from sklearn.svm import LinearSVC

# Linear SVM with an L2 penalty, 'balanced' class weights, and the
# notebook-wide seed as its random state.
model = LinearSVC(
    penalty='l2',
    random_state=seed,
    class_weight='balanced',
)



In [44]:
from sklearn.metrics import accuracy_score

# Train on the training split, then score predictions on the held-out
# validation split (fit returns the estimator itself, so the calls chain).
y_pred = model.fit(X_train, y_train).predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation accuracy:", accuracy)



Validation accuracy: 0.6780644876714745


In [55]:
# Predict the test tweets and write them as a CSV with a 1-based "Id" index
# and a "Prediction" column.
y_test = model.predict(X_test)

# Map the internal labels to the output encoding: 0 -> -1, anything else -> 1.
y_preds = np.where(y_test == 0, -1, 1).tolist()

df = pd.DataFrame(
    {"Prediction": y_preds},
    index=pd.RangeIndex(start=1, stop=len(y_preds) + 1, name="Id"),
)
df.to_csv("test_data_td_idf_svm.csv")

Non-linear models


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Disabled alternative kept for reference:
#   ExtraTreesClassifier(class_weight='balanced', random_state=seed)
# Random forest of 200 trees, each limited to depth 11, with 'balanced'
# class weights and the notebook-wide seed.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=11,
    class_weight='balanced',
    random_state=seed,
)


In [14]:
from sklearn.metrics import accuracy_score

# Train the current `model` on the training split, then score its
# predictions on the held-out validation split.
y_pred = model.fit(X_train, y_train).predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation accuracy:", accuracy)

In [11]:
from sklearn.metrics import accuracy_score

# NOTE(review): this cell repeats the fit/evaluate cell directly before it,
# so running the notebook top to bottom fits `model` a second time here.
# Confirm whether one of the two cells can be dropped.
y_pred = model.fit(X_train, y_train).predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation accuracy:", accuracy)

Validation accuracy: 0.7344007763221737


In [12]:
# Predict the test tweets and write them as a CSV with a 1-based "Id" index
# and a "Prediction" column.
y_test = model.predict(X_test)

# Map the internal labels to the output encoding: 0 -> -1, anything else -> 1.
y_preds = np.where(y_test == 0, -1, 1).tolist()

df = pd.DataFrame(
    {"Prediction": y_preds},
    index=pd.RangeIndex(start=1, stop=len(y_preds) + 1, name="Id"),
)
# NOTE(review): the file name says "extra_tree", but the model cell in this
# section instantiates RandomForestClassifier (the ExtraTrees lines are
# commented out) -- confirm which model this file is meant to hold.
df.to_csv("test_data_td_idf_extra_tree.csv")