In [1]:
import pandas as pd

Add_on = pd.read_csv("/content/add_on_tweets.csv")

Add_on.head()

Unnamed: 0,username,text
0,Counter Foul,#aafaque_ehsens_ideamines_sites #ideamines_shs...
1,Daily #Nigeria #News,Buhari cancels planned trip to Niger http://dl...
2,Scoop Rocket News,2017 Porsche 718 Boxster Expert Review Car Rev...
3,Zbigniew Zarnoch,Unique #gift for this unique #time of the #yea...
4,Progressive Liberal,Morning Joe Panel Laughs Uproariously Over Lat...


In [0]:
Add_on = Add_on.dropna()

In [3]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
import re
import sys
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()

def preprocess_word(word):
    word = word.strip('\'"?!,.():;')
    word = re.sub(r'(.)\1+', r'\1\1', word)
    word = re.sub(r'(-|\')', '', word)
    return word

def is_valid_word(word):
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)

def handle_emojis(tweet):
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', tweet)
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
    return tweet

def preprocess_tweet(tweet):
    use_stemmer = True
    processed_tweet = []
    tweet = tweet.lower()
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    tweet = tweet.strip(' "\'')
    tweet = handle_emojis(tweet)
    tweet = re.sub(r'\s+', ' ', tweet)
    words = tweet.split()
    for word in words:
        word = preprocess_word(word)
        if is_valid_word(word):
            if use_stemmer:
                word = str(porter_stemmer.stem(word))
            processed_tweet.append(word)
    return ' '.join(processed_tweet)

In [0]:
from fastai import *
from fastai.text import *

In [0]:
import pandas as pd

Train = pd.read_csv("/content/train_2kmZucJ_text.csv")
Test = pd.read_csv("/content/test_oJQbWVk_text.csv")

Final = pd.concat([Train , Test , Add_on] , axis = 0 , sort =False)

In [0]:
Final = Final.fillna(0)

In [8]:
bs = 128

data_lm = (TextList.from_df(Final, path = './', cols='text')
    .split_by_rand_pct(0.1, seed=42)
    .label_for_lm()
    .databunch(bs=bs, num_workers=1 , backwards=True))

In [0]:
data_lm.save('data_lm_export.pkl')

In [0]:
data_lm = load_data('./', 'data_lm_export.pkl')

In [0]:
learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=1.0,pretrained=False)

In [0]:
lr = 1e-3
lr *= bs/48

In [16]:
learn_lm.fit_one_cycle(3, lr*10, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.182594,3.915969,0.405859,01:22
1,3.956057,3.721489,0.417248,01:22
2,3.748915,3.607097,0.430361,01:22


In [19]:
#learn_lm.unfreeze()
learn_lm.fit_one_cycle(4, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,3.695668,3.565576,0.43547,01:22
1,3.584916,3.439889,0.452202,01:22
2,3.465086,3.35407,0.463208,01:22
3,3.401011,3.335362,0.465531,01:22


In [0]:
learn_lm.save('fine_tuned_bwd')
learn_lm.save_encoder('fine_tuned_enc_bwd')

In [21]:
data_clas = (TextList.from_df(Train, './', vocab=data_lm.vocab, cols='text')
    .split_by_rand_pct(0.1, seed=42)
    .label_from_df(cols='label')
    .databunch(bs=bs, num_workers=1 , backwards=True))

In [0]:
data_clas.save('data_clas_export.pkl')

In [0]:
data_clas = load_data('./', 'data_clas_export.pkl', bs=bs, num_workers=1 , backwards=True)

In [0]:
learn_cls = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5 , pretrained = False).to_fp16()
learn_cls.load_encoder('fine_tuned_enc_bwd')
learn_cls.freeze()

In [0]:
lr=2e-2
lr *= bs/48

In [28]:
learn_cls.fit_one_cycle(2, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.449466,0.337714,0.832071,00:02
1,0.447593,0.327416,0.861111,00:02


In [29]:
learn_cls.freeze_to(-2)
learn_cls.fit_one_cycle(2, slice(lr/(2.6**4),lr), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.444618,0.402957,0.885101,00:03
1,0.416438,0.267484,0.887626,00:03


In [0]:
learn_cls.save("1st_bwd")

In [31]:
learn_cls.freeze_to(-3)
learn_cls.fit_one_cycle(2, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.397266,0.362901,0.887626,00:05
1,0.389773,0.258802,0.89899,00:04


In [0]:
learn_cls.save("2nd_bwd")

In [33]:
learn_cls.unfreeze()
learn_cls.fit_one_cycle(8, slice(lr/4/(2.6**4),lr/10), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.347176,0.244563,0.901515,00:06
1,0.321633,0.242692,0.909091,00:06
2,0.335961,0.239627,0.905303,00:06
3,0.327831,0.233464,0.910354,00:06
4,0.291839,0.229598,0.912879,00:06
5,0.279129,0.223357,0.907828,00:06
6,0.272937,0.227004,0.911616,00:06
7,0.262063,0.225915,0.911616,00:06


In [0]:
learn_cls.save("3rd_bwd")

**Creating Submit File**

In [0]:
learn_cls = learn_cls.load("3rd_bwd" , purge = False)

In [45]:
Tes_ = pd.read_csv("/content/test_oJQbWVk_text.csv")

Tes_.head()

Unnamed: 0,id,text
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [49]:
' '.join(Tes_.text[0].split(" ")[::-1])

"sucks #apple #ugh apps. download me let Won't upgrade. #iphone new the hate I"

In [50]:
from tqdm import tqdm

predictions = []

for i in tqdm(list(Tes_.text.values)):
    i = ' '.join(i.split(" ")[::-1])
    predictions.append(int(learn_cls.predict(i)[1]))

100%|██████████| 1953/1953 [01:47<00:00, 18.12it/s]


In [0]:
submission = {'id' : list(Tes_.id.values) , 'label' : predictions}

In [0]:
Df = pd.DataFrame.from_dict(submission)

In [0]:
Df.to_csv("submission.csv" , header = True , index = False)