**Analyzing Add on Data**

In [1]:
import pandas as pd

Add_on = pd.read_csv("/content/add_on_tweets.csv")

Add_on.head()

Unnamed: 0,username,text
0,Counter Foul,#aafaque_ehsens_ideamines_sites #ideamines_shs...
1,Daily #Nigeria #News,Buhari cancels planned trip to Niger http://dl...
2,Scoop Rocket News,2017 Porsche 718 Boxster Expert Review Car Rev...
3,Zbigniew Zarnoch,Unique #gift for this unique #time of the #yea...
4,Progressive Liberal,Morning Joe Panel Laughs Uproariously Over Lat...


In [2]:
Add_on.isnull().any()

username    False
text         True
dtype: bool

In [3]:
Add_on.isnull().sum()

username    0
text        4
dtype: int64

In [4]:
Add_on.shape

(40694, 2)

There are only 4 null values , we can drop the whole row with no issues

In [0]:
Add_on = Add_on.dropna()

In [6]:
Add_on.shape  # and GONE !!!!

(40690, 2)

**Text Processing**

In [7]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
import re
import sys
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()

def preprocess_word(word):
    word = word.strip('\'"?!,.():;')
    word = re.sub(r'(.)\1+', r'\1\1', word)
    word = re.sub(r'(-|\')', '', word)
    return word

def is_valid_word(word):
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)

def handle_emojis(tweet):
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', tweet)
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
    return tweet

def preprocess_tweet(tweet):
    use_stemmer = True
    processed_tweet = []
    tweet = tweet.lower()
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    tweet = tweet.strip(' "\'')
    tweet = handle_emojis(tweet)
    tweet = re.sub(r'\s+', ' ', tweet)
    words = tweet.split()
    for word in words:
        word = preprocess_word(word)
        if is_valid_word(word):
            if use_stemmer:
                word = str(porter_stemmer.stem(word))
            processed_tweet.append(word)
    return ' '.join(processed_tweet)

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\n', '\xa0', '\t',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]
mispell_dict = {"aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "couldnt" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "doesnt" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "havent" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    "i'd" : "I would",
    "i'd" : "I had",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll":"it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "shouldnt" : "should not",
    "that's" : "that is",
    "thats" : "that is",
    "there's" : "there is",
    "theres" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "theyre":  "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "wasn't": "was not",
    "we'll":" will",
    "didn't": "did not",
    "tryin'":"trying"
    }


def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x


def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x


def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


def replace_typical_misspell(text):
    mispellings, mispellings_re = _get_mispell(mispell_dict)

    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)


def clean_data(df, columns: list):
    for col in columns:
        df[col] = df[col].apply(lambda x: clean_numbers(x))
        df[col] = df[col].apply(lambda x: clean_text(x.lower()))
        df[col] = df[col].apply(lambda x: replace_typical_misspell(x))

    return df

**Bulding Language Model**

In [0]:
from fastai import *
from fastai.text import *

In [0]:
import pandas as pd

Train = pd.read_csv("/content/train_2kmZucJ_text.csv")
Test = pd.read_csv("/content/test_oJQbWVk_text.csv")

Final = pd.concat([Train , Test , Add_on] , axis = 0 , sort =False)

In [11]:
Final.head()

Unnamed: 0,id,label,text,username
0,1.0,0.0,#fingerprint #Pregnancy Test https://goo.gl/h1...,
1,2.0,0.0,Finally a transparant silicon case ^^ Thanks t...,
2,3.0,0.0,We love this! Would you go? #talk #makememorie...,
3,4.0,0.0,I'm wired I know I'm George I was made that wa...,
4,5.0,1.0,What amazing service! Apple won't even talk to...,


In [12]:
Final.tail()

Unnamed: 0,id,label,text,username
40689,,,Suivez la suite de la 9e journée du #CHFP2019 ...,FootKole
40690,,,WIN 2 Odds Daily & 1.5+ Odds Daily\nJoin us- h...,Featgames
40691,,,WIN 2 Odds Daily & 1.5+ Odds Daily\nJoin us- h...,Featgames
40692,,,"So, not a fan of ESPN sports app for my go to ...",Bobby Sero
40693,,,#HBD #TonyDorsett #formerNFL #Football #RB #Co...,Da'Game


In [0]:
Final = Final.fillna(0)

In [18]:
%time Final['text'] = Final['text'].apply(lambda x : preprocess_tweet(x))

CPU times: user 20.6 s, sys: 14 ms, total: 20.7 s
Wall time: 20.7 s


In [0]:
Final = clean_data(Final , ['text'])

In [20]:
Final.shape

(50563, 4)

In [21]:
Final.head()

Unnamed: 0,id,label,text,username
0,1.0,0.0,fingerprint pregnanc test url android app beau...,0
1,2.0,0.0,final a transpar silicon case thank to my uncl...,0
2,3.0,0.0,we love thi would you go talk makememori unplu...,0
3,4.0,0.0,im wire i know im georg i wa made that way emo...,0
4,5.0,1.0,what amaz servic appl wont even talk to me abo...,0


In [0]:
#, label_cols = 'label'

#data_lm = TextLMDataBunch.from_df(path = './', train_df = Final , valid_df = Final , text_cols = 'text')

In [23]:
bs = 128

data_lm = (TextList.from_df(Final, path = './', cols='text')
    .split_by_rand_pct(0.1, seed=42)
    .label_for_lm()
    .databunch(bs=bs, num_workers=1))

In [0]:
data_lm.save('data_lm_export.pkl')

In [0]:
data_lm = load_data('./', 'data_lm_export.pkl')

In [0]:
learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=1.0)

In [0]:
lr = 1e-3
lr *= bs/48

In [28]:
learn_lm.fit_one_cycle(3, lr*10, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,5.845404,5.263926,0.2262,00:26
1,5.326172,4.931391,0.251507,00:26
2,4.937075,4.822216,0.261058,00:26


In [29]:
learn_lm.unfreeze()
learn_lm.fit_one_cycle(4, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.629325,4.658884,0.280069,00:32
1,4.426744,4.453678,0.307561,00:32
2,4.13458,4.347,0.323214,00:32
3,3.927328,4.334609,0.325465,00:32


In [0]:
learn_lm.save('fine_tuned_fwd')
learn_lm.save_encoder('fine_tuned_enc_fwd')

**Building Classifier**

In [0]:
#Train['text'] = Train['text'].apply(lambda x : preprocess_tweet(x))

In [31]:
data_clas = (TextList.from_df(Train, './', vocab=data_lm.vocab, cols='text')
    .split_by_rand_pct(0.1, seed=42)
    .label_from_df(cols='label')
    .databunch(bs=bs, num_workers=1))

In [0]:
data_clas.vocab.itos = data_lm.vocab.itos

In [0]:
data_clas.save('data_clas_export.pkl')

In [0]:
data_clas = load_data('./', 'data_clas_export.pkl', bs=bs, num_workers=1)

In [0]:
learn_cls = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5).to_fp16()
learn_cls.load_encoder('fine_tuned_enc_fwd')
learn_cls.freeze()

In [0]:
lr=2e-2
lr *= bs/48

In [37]:
learn_cls.fit_one_cycle(2, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.460814,0.258531,0.887626,00:02
1,0.448675,0.281979,0.868687,00:02


In [38]:
learn_cls.freeze_to(-2)
learn_cls.fit_one_cycle(2, slice(lr/(2.6**4),lr), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.448107,0.280352,0.906566,00:03
1,0.394234,0.225667,0.905303,00:03


In [0]:
learn_cls.save("1st")

In [40]:
learn_cls.freeze_to(-3)
learn_cls.fit_one_cycle(2, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.392992,0.510586,0.848485,00:05
1,0.348136,0.240741,0.910354,00:04


In [0]:
learn_cls.save("2nd")

In [43]:
learn_cls.load("2nd")

RNNLearner(data=TextClasDataBunch;

Train: LabelList (7128 items)
x: TextList
xxbos # fingerprint # xxmaj xxunk xxmaj test xxunk xxunk xxunk xxunk xxunk xxunk xxunk # android # xxunk # beautiful # cute # health # xxunk # xxunk # iphonesia # iphone,xxbos xxmaj xxunk a xxunk silicon case xxunk xxmaj xxunk to my xxunk xxunk # yay # xxmaj xxunk # xxmaj xperia # s # xxunk xxunk http xxunk xxunk xxunk xxunk xxunk p xxunk xxunk xxunk,xxbos xxmaj we love xxunk xxunk xxmaj would you go xxunk # talk # xxunk # unplug # relax # iphone # xxunk # wifi # connect xxunk http xxunk xxunk xxunk xxunk xxunk xxunk,xxbos i xxunk xxunk i know i xxunk xxmaj xxunk i was made that way xxunk # iphone # cute # xxunk # home http xxunk xxunk xxunk xxunk xxunk p xxunk xxunk xxunk,xxbos xxmaj what xxunk xxunk xxunk xxmaj apple wo xxunk even talk to me about a question i have unless i pay them xxunk xxunk for their stupid support xxunk
y: CategoryList
0,0,0,0,1
Path: .;

Valid: LabelList (792 items)
x: TextList
xxbos 

In [44]:
learn_cls.freeze_to(-4)
learn_cls.fit_one_cycle(4, slice(lr/3/(2.6**4),lr/2), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.330345,0.25241,0.900253,00:06
1,0.325304,0.272498,0.90404,00:05
2,0.245802,0.237148,0.90404,00:06
3,0.177711,0.252249,0.905303,00:06


In [0]:
learn_cls.save("3rd")

In [42]:
learn_cls.unfreeze()
learn_cls.fit_one_cycle(8, slice(lr/4/(2.6**4),lr/10), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.255088,0.229717,0.906566,00:06
1,0.238522,0.234246,0.907828,00:06
2,0.233657,0.237318,0.911616,00:06
3,0.186932,0.289928,0.901515,00:06
4,0.134729,0.318431,0.896465,00:06
5,0.092766,0.314527,0.900253,00:06
6,0.068748,0.343741,0.901515,00:06
7,0.053477,0.339279,0.901515,00:06


Above step clearly overfitted

**Anyways Lets get down to our  submission**

In [0]:
learn_cls = learn_cls.load("3rd")

In [47]:
Tes_ = pd.read_csv("/content/test_oJQbWVk_text.csv")

Tes_.head()

Unnamed: 0,id,text
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [0]:
Tes_['text'] = Tes_['text'].apply(lambda x : preprocess_tweet(x))
Tes_ = clean_data(Tes_ , ['text'])

In [49]:
from tqdm import tqdm

predictions = []

for i in tqdm(list(Tes_.text.values)):
    predictions.append(int(learn_cls.predict(i)[1]))

100%|██████████| 1953/1953 [01:41<00:00, 19.27it/s]


In [0]:
submission = {'id' : list(Tes_.id.values) , 'label' : predictions}

In [0]:
Df = pd.DataFrame.from_dict(submission)

In [0]:
Df.to_csv("submission.csv" , header = True , index = False)