In [None]:

import numpy as np
import pandas as pd
import sklearn


In [None]:
!pip install fasttext



Load preprocessed data


In [None]:
def load_tweets(file_path):
    """Read a UTF-8 text file and return its lines as a list of tweets.

    Every line of the file is treated as one tweet. Only the trailing newline
    is removed; any other leading or trailing whitespace is kept as-is.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings, one per line, in file order (empty for an empty file).
    """
    with open(file_path, 'r', encoding='utf-8') as tweet_file:
        return [line.rstrip('\n') for line in tweet_file]

In [None]:
    # Load the three preprocessed corpora (one tweet per line) into plain lists:
    # positive training tweets, negative training tweets, and the test tweets.
    train_pos_tweets, train_neg_tweets, test_tweets = (
        load_tweets('/content/processed_pos_tweets_non_transformer.txt'),
        load_tweets('/content/processed_neg_tweets_non_transformer.txt'),
        load_tweets('/content/processed_test_tweets_non_transformer.txt'),
    )
    print("Tweets loaded")

Tweets loaded


Convert to DataFrame

In [None]:
    from sklearn.model_selection import train_test_split
    from sklearn.utils import shuffle

    # Seed used for the train/validation split below.
    seed = 12222

    # Binary targets: 1 for every positive tweet, 0 for every negative tweet.
    train_pos_labels = [1 for _ in train_pos_tweets]
    train_neg_labels = [0 for _ in train_neg_tweets]

    # Positives first, then negatives; texts and labels stay index-aligned.
    train_tweets = [*train_pos_tweets, *train_neg_tweets]
    train_labels = [*train_pos_labels, *train_neg_labels]

    # Shuffle texts and labels together (note: fixed random_state=10, not `seed`).
    train_tweets, train_labels = shuffle(train_tweets, train_labels, random_state=10)
    data = pd.DataFrame({'tweet': train_tweets, 'label': train_labels})

    # Hold out 5% of the labelled data for validation, stratified on the label
    # so both splits keep the same class balance.
    X, y = list(data["tweet"]), list(data["label"])
    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=0.05,
        stratify=y,
        random_state=seed,
    )

    # Wrap each split in a DataFrame; the test set has text only (no labels).
    X_train_set = pd.DataFrame({'tweet': X_train, 'label': y_train})
    X_val_set = pd.DataFrame({'tweet': X_val, 'label': y_val})
    X_test_set = pd.DataFrame({'tweet': test_tweets})


Transform data into the format expected by fastText

In [None]:
train_file = '/content/train_data.txt'
val_file = '/content/val_data.txt'
test_file = '/content/test_data.txt'

# Training file in fastText's supervised format: one example per line,
# "__label__<label> <text>".
# The encoding is pinned to UTF-8 (matching how load_tweets reads the corpora)
# instead of relying on the platform's locale default, so non-ASCII tweets are
# written identically everywhere.
with open(train_file, 'w', encoding='utf-8') as f:
    # Iterating the columns directly avoids building a Series per row, which
    # is what DataFrame.iterrows() does.
    f.writelines(
        '__label__{} {}\n'.format(label, tweet)
        for label, tweet in zip(X_train_set['label'], X_train_set['tweet'])
    )

# Validation file: tweet text only, one per line (labels stay in X_val_set / y_val).
with open(val_file, 'w', encoding='utf-8') as f:
    f.writelines('{}\n'.format(tweet) for tweet in X_val_set['tweet'])

# Test file: tweet text only, one per line.
with open(test_file, 'w', encoding='utf-8') as f:
    f.writelines('{}\n'.format(tweet) for tweet in X_test_set['tweet'])

Train fastText


In [None]:
import fasttext

# Train a supervised fastText classifier on the labelled training file.
# NOTE(review): fastText may train with several threads by default, in which
# case `seed` alone might not make runs exactly reproducible — confirm.
model = fasttext.train_supervised(
    input=train_file,
    lr=0.01,
    dim=150,
    epoch=20,
    seed=seed,
)

Evaluation on validation set

In [None]:
# (Re)write the label-free validation file: one tweet per line, text only.
# The encoding is pinned to UTF-8 (matching how load_tweets reads the corpora)
# rather than relying on the platform's locale default.
with open(val_file, 'w', encoding='utf-8') as f:
    # Iterate the column directly; iterrows() would build a Series per row.
    f.writelines('{}\n'.format(tweet) for tweet in X_val_set['tweet'])

In [None]:
# Read the validation tweets back, one per line. The path comes from `val_file`
# (the same '/content/val_data.txt' written earlier) and the encoding is pinned
# to UTF-8 instead of the platform's locale default.
with open(val_file, encoding='utf-8') as f:
    val_data = [line.strip() for line in f]

# Predict a label for every *validation* tweet. model.predict returns
# (labels, probabilities); keep the top label with its '__label__' prefix removed,
# leaving the strings '0' / '1'.
labels = [model.predict(text)[0][0].replace('__label__', '') for text in val_data]

In [None]:
from sklearn.metrics import accuracy_score

# Map the predicted label strings back to integers: '0' -> 0, anything else -> 1,
# then compare them with the held-out validation labels.
y_preds = [int(val != '0') for val in labels]
accuracy = accuracy_score(list(y_val), y_preds)
print('Validation Accuracy:', accuracy)

Validation Accuracy: 0.8018685694011906


Generating predictions for test set

In [None]:
# Read the test tweets back, one per line. The path comes from `test_file`
# (the same '/content/test_data.txt' written earlier) and the encoding is pinned
# to UTF-8 instead of the platform's locale default.
with open(test_file, encoding='utf-8') as f:
    test_data = [line.strip() for line in f]

# Top predicted label per tweet, with the '__label__' prefix removed.
labels = [model.predict(text)[0][0].replace('__label__', '') for text in test_data]

# Encode predictions for the output file: label '0' -> -1, any other label -> 1.
y_preds = [-1 if val == '0' else 1 for val in labels]

# One row per test tweet; the index becomes the "Id" column and starts at 1.
df = pd.DataFrame(y_preds, columns=["Prediction"])
df.index.name = "Id"
df.index += 1
df.to_csv("/content/test_data_fastText.csv")