# FastText - Project 1 - COMP90051 Statistical Machine Learning

### Group: Kelloggs
Team Members: Dean Pakravan, David Watson, Aaron Qiu

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from textblob import TextBlob
sns.set_style('darkgrid')
%matplotlib inline

print('Import libraries successful!')

# TO BE RUN ONLY ONCE - If it is the first time

The following three cells take the training data set provided on Kaggle and prefix each tweet's author id with '__label__', which is how fastText recognises the author ids as labels. (fastText lets you choose a different prefix via its `label` option.) If this is the first time you are running the notebook on your machine, uncomment and run the following three cells.

In [None]:
# def readLabelData(path):
#     fo = open(path, "r", encoding="utf8")
#     data = fo.readlines();
#     fo.close()
#     return data

# data = readLabelData('train_tweets.txt')
# print(data[0])

In [None]:
# to attach __label__ to the author

# for i in range(len(data)):
#     data[i] = "__label__" + data[i]

# print(data[0])

In [None]:
# def writeLabel(data):
#     f= open("train_tweet_label.txt","w+", encoding="utf8")
#     for i in range(len(data)):
#         f.write(data[i]) 
#     return

# writeLabel(data)

# The remaining code can be run as normal

In [None]:
# Read in our already labeled data
def readLabelData(path):
    """Read a UTF-8 text file and return its lines.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one string per line; each string keeps its trailing
        newline, exactly as ``readlines`` returns it.
    """
    # The context manager closes the file even if reading raises.
    with open(path, "r", encoding="utf8") as fo:
        return fo.readlines()

# Load the '__label__'-prefixed training rows (one string per line of the file).
data = readLabelData('train_tweet_label.txt')
# Show the first row as a sanity check.
print(data[0])

# Remove RE-TWEET

The following two cells both remove retweets successfully. The first cell was written first and is extremely slow compared to the second one; it is kept for documentation.

In [None]:
# Checks if the first letters are RT

# def remRetweet(data):
#     arr = []
#     for i in range(len(data)):
#         word = (data[i].split('\t'))
#         # Ignore empty tweets
#         if (len(word) >= 2):
#             word = word[1]
#             # Ignore tweets with less than 2 characters
#             if (len(word) >= 2):
#                 word = word[:2]
#                 if (word == 'RT'):
#                     arr.append(i)
#     print(len(arr))            
#     for j in range(len(arr)):
#         index = arr[j] - j
#         del data[index]
#     return data

# print(len(data))
# data = remRetweet(data)
# print(len(data))
            

In [None]:
# Row count before filtering.
print(len(data))
# Remove retweets; assumes RT is at the start of the tweet, i.e. directly
# after the tab that separates the label from the tweet text.
data = [row for row in data if '\tRT' not in row]
# Row count after filtering.
print(len(data))

In [None]:
# Quick check after removing retweets: show the first remaining row.
print(data[0])

# Pre-process the data

In [None]:
# Run if the first time
pip install emoji

In [None]:
import re
import itertools
import emoji

# Compiled once instead of on every row. Raw strings keep the backslashes
# literal without triggering Python's invalid-escape-sequence warnings.
_MENTION_OR_HASHTAG = re.compile(r"(@[A-Za-z0-9]+|(#[A-Za-z0-9]+))")
_PUNCTUATION = re.compile(r"[\.\,\!\?\:\;\-\=]")


def _clean_tweet_text(text):
    """Return `text` normalised for fastText.

    Steps, in order: drop @mentions and #hashtags, drop the listed
    punctuation, lowercase, then squeeze any run of the same character
    down to at most two (e.g. "soooo" -> "soo"). Whitespace is collapsed
    to single spaces along the way.
    """
    text = ' '.join(_MENTION_OR_HASHTAG.sub(" ", text).split())
    text = ' '.join(_PUNCTUATION.sub(" ", text).split())
    text = text.lower()
    return ''.join(''.join(run)[:2] for _, run in itertools.groupby(text))


for i, line in enumerate(data):
    # Each row is "<label>\t<tweet>". Only the tweet is cleaned: running the
    # repeated-character squeeze over the label would corrupt author ids
    # (e.g. "__label__1000" would become "__label__100").
    label, tab, tweet = line.partition('\t')
    if tab:
        data[i] = ' '.join((label + ' ' + _clean_tweet_text(tweet)).split())
    else:
        # No label/tweet separator on this row: clean the whole line as before.
        data[i] = _clean_tweet_text(line)
    # Emoji handling was tried and left disabled:
    # data[i] = emoji.demojize(data[i])
print(data[0])

# Save after pre-processing to avoid repetition

The following cells are commented out because the pre-processing stage changed constantly throughout testing. They are kept purely for documentation.

In [None]:
# def writePred(data):
#     f= open("train_tweet_label_pred.txt","w+", encoding="utf-8")
#     for i in range(len(data)):
#         f.write(data[i] + "\n") 
#     return

# writePred(data)

In [None]:
# open pre-processed data
# def openPred(path):
#     fo = open(path, "r", encoding="utf-8")
#     data = fo.readlines();
#     fo.close()
#     return data

# data = openPred("train_tweet_label_pred.txt")

# Sample it and save it

In [None]:
# Sample the data
# Currently set at 10,000 rows, altered for testing.
# NOTE: this takes the first 10,000 rows of `data` in their current order;
# it is not a random sample.
dataSamp = data[:10000]
print(len(dataSamp))

def writeSamp(dataSamp):
    """Write the sampled rows to ``train_tweet_label_samp.txt``.

    The file is created (or overwritten) in the current working directory,
    encoded as UTF-8, with a newline appended after every row.

    Args:
        dataSamp: Iterable of strings, one per row, without trailing newlines.
    """
    # `with` guarantees the file is flushed and closed before any later
    # step reads it back from disk.
    with open("train_tweet_label_samp.txt", "w", encoding="utf-8") as f:
        f.writelines(row + "\n" for row in dataSamp)

# Persist the sample to train_tweet_label_samp.txt so it can be split from disk.
writeSamp(dataSamp)

In [None]:
# TO SPLIT THE DATA SET AND WRITE TO A FILE
def shuffle_split(infilename, outfilename1, outfilename2):
    """Shuffle the lines of a file and split them 90/10 into two files.

    Args:
        infilename: UTF-8 text file to read.
        outfilename1: Receives the first 90% of the shuffled lines
            (rounded down); used as the training split.
        outfilename2: Receives all remaining lines; used as the
            validation split.

    Every input line ends up in exactly one of the two output files. An
    empty input produces two empty output files.
    """
    from random import shuffle

    with open(infilename, 'r', encoding="utf8") as f:
        lines = f.readlines()
    # Only the file's final line can lack a newline. Repair it BEFORE
    # shuffling, otherwise it may land mid-file and be glued to the next line.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'
    shuffle(lines)
    n_train = len(lines) * 90 // 100
    with open(outfilename1, 'w', encoding="utf8") as f:
        f.writelines(lines[:n_train])
    with open(outfilename2, 'w', encoding="utf8") as f:
        # Start exactly at n_train so no line is dropped between the splits.
        f.writelines(lines[n_train:])

# Split the sampled file: training rows go to train_tweet_BIG.txt,
# validation rows go to train_tweet_valid.txt.
shuffle_split('train_tweet_label_samp.txt', 'train_tweet_BIG.txt','train_tweet_valid.txt')

# Apply fasttext

In [None]:
import fasttext
# Vary the hyper-parameters
# Keys are fastText train_supervised keyword arguments:
#   lr         - learning rate
#   epoch      - number of passes over the training file
#   wordNgrams - maximum length of word n-grams used as features
#   dim        - size of the word vectors
#   loss       - loss function
hyper_params = {"lr": 1,
    "epoch": 5,
    "wordNgrams": 2, # this is the best
    "dim": 5,
    "loss": 'softmax'}     
        
# Train the model.
# Trains on the training split file; the dict is unpacked into keyword arguments.
model = fasttext.train_supervised('train_tweet_BIG.txt', **hyper_params)
print("Model trained with the hyperparameter \n {}".format(hyper_params))

In [None]:
# CHECK PERFORMANCE
# Evaluate the model on the file it was trained on and on the held-out
# validation file.
result = model.test('train_tweet_BIG.txt')
validation = model.test('train_tweet_valid.txt')
        
# DISPLAY ACCURACY OF TRAINED MODEL
# Element [1] of each result is reported below as "accuracy"/"validation".
# NOTE(review): per the fastText docs, test() returns
# (number of samples, precision@1, recall@1), so [1] is precision at 1 - confirm.
text_line = str(hyper_params) + ",accuracy:" + str(result[1])  + ",validation:" + str(validation[1]) + '\n' 
print(text_line)

In [None]:
# If you wish to save the model
# Writes the trained model to model_filename.ftz in the working directory.
model.save_model("model_filename.ftz")

In [None]:
# To load any previous model - only good if we want to retest some test data
# Rebinds `model` to the model read from model_filename.ftz, replacing
# whatever model is currently held in memory.
model = fasttext.load_model("model_filename.ftz")

# To test on the unlabelled data

In [None]:
# Read in our test data
def readLabelData(path):
    """Read a UTF-8 text file and return its lines.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one string per line; each string keeps its trailing
        newline, exactly as ``readlines`` returns it.
    """
    # The context manager closes the file even if reading raises.
    with open(path, "r", encoding="utf8") as fo:
        return fo.readlines()

# Load the unlabeled test tweets, one string per line of the file.
unlabeledData = readLabelData('test_tweets_unlabeled.txt')
# Show the first two rows as a sanity check.
print(unlabeledData[:2])

In [None]:
# Replace newline characters with spaces, then collapse all whitespace runs
# to a single space (which also trims both ends of each tweet).
for idx, raw_tweet in enumerate(unlabeledData):
    without_newlines = re.sub("[\n]", " ", raw_tweet)
    unlabeledData[idx] = ' '.join(without_newlines.split())
# Notebook display of the first two cleaned rows.
unlabeledData[:2]

In [None]:
# pre process test data
import re
import itertools

def pre_process_test(data):
    """Normalise a list of tweets in place and return it.

    Each tweet goes through these steps, in order:
      1. @mentions and #hashtags (ASCII letters/digits only) are removed.
      2. The punctuation characters . , ! ? : ; - = are removed.
      3. The text is lowercased.
      4. Any run of the same character is squeezed to at most two
         (e.g. "soooo" -> "soo").
    Whitespace is collapsed to single spaces after steps 1 and 2.

    Args:
        data: Mutable list of tweet strings; its elements are overwritten.

    Returns:
        The same list object, with every element replaced by its
        normalised form.
    """
    # Raw strings keep the backslashes literal without invalid-escape
    # warnings; compiling once avoids a pattern lookup per tweet.
    mention_or_hashtag = re.compile(r"(@[A-Za-z0-9]+|(#[A-Za-z0-9]+))")
    punctuation = re.compile(r"[\.\,\!\?\:\;\-\=]")

    for i, tweet in enumerate(data):
        # Remove @ and #
        tweet = ' '.join(mention_or_hashtag.sub(" ", tweet).split())
        # Remove punctuation
        tweet = ' '.join(punctuation.sub(" ", tweet).split())
        # Lowercase
        tweet = tweet.lower()
        # Squeeze elongated words: keep at most two of any repeated character
        data[i] = ''.join(''.join(run)[:2] for _, run in itertools.groupby(tweet))
    return data
 
# Apply the tweet clean-up defined in pre_process_test to the test tweets.
unlabeledData = pre_process_test(unlabeledData)

# Show the first cleaned tweet as a sanity check.
print(unlabeledData[0])

In [None]:
# Use the trained model to predict on the unlabeled data
# k=1 requests only the single best label per tweet. final[0] is indexed
# per tweet when the submission file is written.
# NOTE(review): fastText's predict is understood to reject text containing
# '\n', which is presumably why newlines were stripped earlier - confirm.
final = model.predict(unlabeledData,k=1)

In [None]:
# Write the submission text file: a header row, then one "<id>,<author>" row
# per prediction, with ids counting up from 1.
with open("submission.txt", "w") as f:
    f.write('Id,Predicted\n')
    for row_id, labels in enumerate(final[0], start=1):
        # Strip the "__label__" prefix so only the author id remains.
        author = ' '.join(re.sub("__label__", " ", labels[0]).split())
        f.write(str(row_id) + ',' + author + '\n')