In [None]:
import numpy as np
import pandas as pd
import sklearn
from xgboost import XGBRFClassifier, XGBClassifier

In [None]:
def load_tweets(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Every line of the file becomes one list entry. Only trailing newline
    characters are stripped; any other whitespace in the line is kept.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [line.rstrip('\n') for line in handle]

In [None]:
    # Read the three tweet files with load_tweets (one list entry per line).
    # The paths are absolute /content/... locations, so the files must
    # already be present there before this cell runs.
    train_pos_tweets = load_tweets('/content/processed_pos_tweets_non_transformer.txt')
    train_neg_tweets = load_tweets('/content/processed_neg_tweets_non_transformer.txt')
    test_tweets = load_tweets('/content/processed_test_tweets_non_transformer.txt')

    print("Tweets loaded")

Tweets loaded


In [None]:
    from sklearn.model_selection import train_test_split
    from sklearn.utils import shuffle

    # Seed used for the train/validation split below.
    seed = 12222

    # Label convention: 1 for tweets from the positive file, 0 for the negative file.
    train_pos_labels = [1] * len(train_pos_tweets)
    train_neg_labels = [0] * len(train_neg_tweets)

    # Positive examples first, then negative, so tweets and labels stay aligned.
    train_tweets = train_pos_tweets + train_neg_tweets
    train_labels = train_pos_labels + train_neg_labels

    # Shuffle both lists together (fixed random_state, distinct from `seed`).
    train_tweets, train_labels = shuffle(train_tweets, train_labels, random_state=10)
    data = pd.DataFrame({'tweet': train_tweets, 'label': train_labels})

    X = data["tweet"].tolist()
    y = data["label"].tolist()

    # Hold out 5% of the labelled tweets for validation, stratified on the label.
    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=0.05,
        stratify=y,
        random_state=seed,
    )
    X_train_set = pd.DataFrame({'tweet': X_train, 'label': y_train})
    X_val_set = pd.DataFrame({'tweet': X_val, 'label': y_val})

    # The test frame only has a 'tweet' column (no labels).
    X_test_set = pd.DataFrame({'tweet': test_tweets})

In [None]:
!pip install gensim



In [None]:
import nltk
from nltk import WhitespaceTokenizer
# Shared tokenizer instance used by tokenize().
tokenizer = WhitespaceTokenizer()


def tokenize(df):
    """Tokenize the 'tweet' column of ``df`` in place.

    Each entry of ``df['tweet']`` is replaced by the list of tokens returned
    by the module-level ``tokenizer``. Entries that are already lists (for
    example because this function was run on the same frame before) are left
    unchanged, so re-running the cell does not feed token lists back into
    the tokenizer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tweet' column; it is modified in place.

    Returns
    -------
    None
    """
    df['tweet'] = df['tweet'].apply(
        lambda tweet: tweet if isinstance(tweet, list) else tokenizer.tokenize(tweet)
    )

In [None]:
# Tokenize the 'tweet' column of every split (train, validation, test) in place.
for split_frame in (X_train_set, X_val_set, X_test_set):
    tokenize(split_frame)

In [None]:
input = X_train_set['tweet'].tolist()

In [None]:
from gensim.models import Word2Vec
# define hyperparameters
model = Word2Vec(min_count=5,
                 sample=5e-5,
                     window=3,
                     vector_size=250,
                      alpha=0.035,
                     min_alpha=0.00075,
                     negative=5,
                     workers=4,
                 seed = seed)

In [None]:
model.build_vocab(input)

In [None]:
model.train(input,total_examples=model.corpus_count, epochs=25)

(211940308, 394880650)

In [None]:
# Sanity check: list the words whose vectors are most similar to "smile".
model.wv.most_similar(positive=["smile"])


[('smiling', 0.608976423740387),
 ('face', 0.5537009835243225),
 ('frown', 0.5357383489608765),
 ('always', 0.5272393822669983),
 ('heart', 0.4652939736843109),
 ('person', 0.45580169558525085),
 ('make', 0.445931077003479),
 ('happy', 0.4452197253704071),
 ('love', 0.44057604670524597),
 ('beautiful', 0.43544161319732666)]

In [None]:
# Sanity check: list the words whose vectors are most similar to the ")" token.
model.wv.most_similar(positive=[")"])


[('then', 0.9948365092277527),
 ('same', 0.9930252432823181),
 ('will', 0.9929379820823669),
 ('not', 0.9929072856903076),
 ('but', 0.9928439259529114),
 ('there', 0.9927449822425842),
 ("don't", 0.9926263689994812),
 ('rt', 0.9925832152366638),
 ("it's", 0.9920400977134705),
 ('why', 0.9918318390846252)]

In [None]:
# Sanity check: list the words whose vectors are most similar to the "(" token.
model.wv.most_similar(positive=["("])


[('same', 0.9956627488136292),
 ("don't", 0.9945088624954224),
 ('here', 0.9943466782569885),
 ('who', 0.9941233396530151),
 ('that', 0.9940735697746277),
 ('at', 0.9939746856689453),
 ('out', 0.9939121007919312),
 ('im', 0.9938834309577942),
 ('but', 0.993812620639801),
 ('did', 0.9937651753425598)]

In [None]:
# Sanity check: list the words whose vectors are most similar to "fun".
model.wv.most_similar(positive=["fun"])


[('enjoy', 0.4797041118144989),
 ('weekend', 0.47730129957199097),
 ('awesome', 0.45192423462867737),
 ('great', 0.4381538927555084),
 ('partying', 0.43800804018974304),
 ('good', 0.42641547322273254),
 ('tomorrow', 0.4135989248752594),
 ('going', 0.41259416937828064),
 ('funn', 0.4123479425907135),
 ('excited', 0.40758955478668213)]

In [None]:
size = 250


def tweet_embeddings(tweet, size):
    """Return the average word vector of a tweet as a (1, size) array.

    ``tweet`` is an iterable of tokens. Each token is looked up in the
    global ``model.wv``; tokens whose lookup raises KeyError are skipped.
    The vectors that were found are summed and divided by their count.
    When no token is found the result is all zeros.
    """
    total = np.zeros((1, size))
    found = 0
    for token in tweet:
        try:
            token_vec = model.wv[token]
        except KeyError:
            # token is not in the vocabulary
            continue
        total += token_vec.reshape((1, size))
        found += 1
    # average over the tokens that had a vector
    if found:
        total /= found
    return total

In [None]:
# Embed every tweet of each split; joining the (1, size) rows along axis 0
# gives one 2-D feature matrix per split.
transformed_X_train = np.concatenate(
    [tweet_embeddings(tokens, size) for tokens in X_train_set['tweet']], axis=0
)
transformed_X_val = np.concatenate(
    [tweet_embeddings(tokens, size) for tokens in X_val_set['tweet']], axis=0
)
transformed_X_test = np.concatenate(
    [tweet_embeddings(tokens, size) for tokens in X_test_set['tweet']], axis=0
)

In [None]:
# Display the shape of the training feature matrix (rows = tweets, columns = embedding size).
transformed_X_train.shape

(2153743, 250)

In [None]:
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
# Alternative classifiers, currently disabled.
# NOTE(review): these assign to `model`, which is the Word2Vec instance that
# tweet_embeddings reads -- use a different variable name if one is re-enabled.
# model = RandomForestClassifier(n_estimators = 200,random_state=seed, max_depth = 10, min_samples_split = 30,verbose =True, n_jobs=-1)
# model = RandomForestClassifier(n_estimators=235, max_depth=11, verbose=True,n_jobs=-1,random_state=seed)
# model = SGDClassifier(loss="log", penalty="l1")

# Gradient-boosted trees for the binary sentiment label.
# random_state is set from the notebook-wide `seed` so the classifier is
# seeded like the train/validation split.
# NOTE(review): tree_method="gpu_hist" requires a GPU runtime; newer XGBoost
# releases replace it with tree_method="hist" plus device="cuda" -- confirm
# against the installed version.
model_xgb = XGBClassifier(
    n_estimators=1250,
    tree_method="gpu_hist",
    objective='binary:logistic',
    random_state=seed,
)

In [None]:
# Fit the classifier on the training embeddings and their 0/1 labels.
model_xgb.fit(transformed_X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Predict on the held-out validation embeddings and report the fraction of
# predictions that match y_val.
y_pred = model_xgb.predict(transformed_X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation accuracy:", accuracy)

Validation accuracy: 0.7932071809801068


In [None]:
# Predicted classes for the test embeddings.
y_test = model_xgb.predict(transformed_X_test)

# Map class 0 to -1; every other predicted value becomes 1.
y_preds = [1 if val != 0 else -1 for val in y_test]

# Single "Prediction" column with a 1-based index written out as "Id".
df = pd.DataFrame(y_preds, columns=["Prediction"])
df.index = df.index + 1
df.index.name = "Id"

# NOTE(review): the target is under /content/drive, and nothing in the visible
# cells mounts it -- confirm the directory exists before running.
df.to_csv("/content/drive/MyDrive/test_data_word2vec_final.csv")