In [1]:
import pandas as pd
from tqdm import tqdm
import re
import numpy as np
from nltk.corpus import stopwords

# English stop-word list from NLTK's stopwords corpus.
stop = stopwords.words('english')

In [2]:
# Read the training and test tweet CSVs from the local data directory.
train = pd.read_csv('./data/train_E6oV3lV.csv')
test = pd.read_csv('./data/test_tweets_anuFYb8.csv')

In [3]:
# Shuffle the training rows once, with a fixed seed so that the positional
# 80/10/10 split written out later is the same on every run.
train = train.sample(frac=1, random_state=42)

# Last expression of the cell, so the preview is actually displayed.
train.head()

In [4]:
# Stack train and test so both get identical text cleaning; the index is
# renumbered 0..n-1, which later positional slicing relies on.
# pd.concat replaces DataFrame.append, which newer pandas no longer provides.
combi = pd.concat([train, test], ignore_index=True, sort=False)

In [5]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression; each non-overlapping match is deleted.

    Returns
    -------
    str
        The text with all matches replaced by the empty string.
    """
    # Substitute with the pattern itself. Re-using each matched string as a
    # new, unescaped regex would also delete prefixes of longer matches
    # ("@user" inside "@username") and mis-handle regex metacharacters.
    return re.sub(pattern, '', input_txt)

In [6]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# remove twitter handles (@user); raw string so "\w" is not an invalid escape
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], r"@[\w]*")

# remove special characters, numbers, punctuations
# (regex=True is explicit: newer pandas treats the pattern literally by default)
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

# drop words of one or two characters
combi['tidy_tweet'] = combi['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 2]))

# turn the '#' sign into a separate "Hashtag" token (plain-text replacement)
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace('#', ' Hashtag ', regex=False)

# whitespace tokenisation
tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split())

stemmer = PorterStemmer()
lmtzr = WordNetLemmatizer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])  # stemming
tokenized_tweet = tokenized_tweet.apply(lambda x: [lmtzr.lemmatize(i) for i in x])  # lemmatize

# join the tokens back into one space-separated string per tweet
tokenized_tweet = tokenized_tweet.apply(' '.join)

combi['tidy_tweet'] = tokenized_tweet

In [7]:
# Split the cleaned frame back apart by position: the first len(train) rows
# are the training rows, everything after them is the test set.
df_train = combi.iloc[:len(train)]
df_test = combi.iloc[len(train):]

In [8]:
# Class balance of the training labels.
df_train['label'].value_counts()

0.0    29720
1.0     2242
Name: label, dtype: int64

In [9]:
# Preview the first rows to compare the raw `tweet` with the cleaned `tidy_tweet`.
df_train.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,31831,0.0,@user it's off to watch the football game bet...,off watch the footbal game between hashtag rom...
1,6163,0.0,i was watching justin's new clip and thinking ...,wa watch justin new clip and think abt day fam...
2,15562,0.0,"@user just opened packs for 2mill coins, best ...",just open pack for mill coin best player you a...
3,17619,1.0,@user new video! super mario run is sexist! ...,new video super mario run sexist hashtag chris...
4,17074,0.0,#duschszene #fear #origins #pib #moore #temp...,hashtag duschszen hashtag fear hashtag origin ...


In [10]:
# Load flair's pre-trained 'en-sentiment' text classifier.
# NOTE(review): `classifier` is rebound to a newly constructed TextClassifier in a
# later cell, and nothing in between uses this pre-trained model — confirm whether
# this load is still needed.
from flair.models import TextClassifier
from flair.data import Sentence
classifier = TextClassifier.load('en-sentiment')

2020-01-13 16:54:30,384 loading file /Users/subir/.flair/models/imdb-v0.4.pt


In [11]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings , BertEmbeddings ,CharacterEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

In [12]:
# Two-column frame written out for flair: the label, prefixed with
# '__label__', followed by the cleaned tweet text.
# (The former rename of "v1"/"v2" matched no column and has been dropped.)
# Note: this rebinds `train` from the raw training frame to the two-column frame.
train = df_train[['label', 'tidy_tweet']].copy()
train['label'] = '__label__' + train['label'].astype(str)

# Positional 80 % / 10 % / 10 % split into train / test / dev files.
train_end = int(len(train) * 0.8)
test_end = int(len(train) * 0.9)

train.iloc[:train_end].to_csv('train.csv', sep='\t', index=False, header=False)
train.iloc[train_end:test_end].to_csv('test.csv', sep='\t', index=False, header=False)
train.iloc[test_end:].to_csv('dev.csv', sep='\t', index=False, header=False)

In [13]:
# Read the three tab-separated files from the current directory as a corpus.
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('.'),
    train_file='train.csv',
    test_file='test.csv',
    dev_file='dev.csv',
)

# Token-level embeddings that get stacked: GloVe, forward and backward Flair
# language models, and character embeddings.
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('en-forward'),
    FlairEmbeddings('en-backward'),
    CharacterEmbeddings(),
]

# LSTM over the token embeddings yields one vector per document.
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=128,
    reproject_words=True,
    reproject_words_dimension=64,
)

# Single-label classifier over the labels found in the corpus.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=corpus.make_label_dictionary(),
    multi_label=False,
)

2020-01-13 16:55:44,876 Reading data from .
2020-01-13 16:55:44,878 Train: train.csv
2020-01-13 16:55:44,879 Dev: dev.csv
2020-01-13 16:55:44,881 Test: test.csv


  """Entry point for launching an IPython kernel.
  max_tokens_per_doc=max_tokens_per_doc,
  max_tokens_per_doc=max_tokens_per_doc,
  max_tokens_per_doc=max_tokens_per_doc,


2020-01-13 16:55:53,308 Computing label dictionary. Progress:


  This is separate from the ipykernel package so we can avoid doing imports until
100%|██████████| 25561/25561 [00:00<00:00, 197428.17it/s]

2020-01-13 16:55:53,446 [b'0.0', b'1.0']





In [None]:
# Train the classifier on the corpus; './falir_model/' (spelling as used here)
# is the path handed to the trainer for its output.
trainer = ModelTrainer(classifier, corpus)
trainer.train(
    './falir_model/',
    max_epochs=5,
    checkpoint=True,
    monitor_train=True,
    mini_batch_size=2,
)

2020-01-13 16:57:51,783 ----------------------------------------------------------------------------------------------------
2020-01-13 16:57:51,785 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
      (list_embedding_3): CharacterEmbeddings(
        (char_embedding): Embedding(275, 25)
        (char_rnn): LSTM(25, 25, bidirectional=True)


2020-01-14 05:51:04,324 epoch 5 - iter 0/12781 - loss 0.00263643 - samples/sec: 1863.09
2020-01-14 05:56:15,020 epoch 5 - iter 1278/12781 - loss 0.15423717 - samples/sec: 8.24
2020-01-14 06:01:26,476 epoch 5 - iter 2556/12781 - loss 0.15985416 - samples/sec: 8.23
2020-01-14 06:06:16,884 epoch 5 - iter 3834/12781 - loss 0.16379583 - samples/sec: 8.82
2020-01-14 06:11:01,195 epoch 5 - iter 5112/12781 - loss 0.17260489 - samples/sec: 9.01
2020-01-14 06:15:45,161 epoch 5 - iter 6390/12781 - loss 0.17370627 - samples/sec: 9.02
2020-01-14 06:20:29,758 epoch 5 - iter 7668/12781 - loss 0.17592170 - samples/sec: 9.00
2020-01-14 06:25:12,577 epoch 5 - iter 8946/12781 - loss 0.17735764 - samples/sec: 9.05
2020-01-14 06:29:53,744 epoch 5 - iter 10224/12781 - loss 0.17806942 - samples/sec: 9.11
2020-01-14 06:34:36,105 epoch 5 - iter 11502/12781 - loss 0.18129507 - samples/sec: 9.07
2020-01-14 06:39:13,122 epoch 5 - iter 12780/12781 - loss 0.18272137 - samples/sec: 9.25
2020-01-14 06:39:13,194 -----

  result = unpickler.load()


In [None]:
from flair.visual.training_curves import Plotter

# Plot the training logs. They are read from './falir_model/', the directory
# passed to trainer.train(), instead of the notebook's working directory.
plotter = Plotter()
plotter.plot_training_curves('./falir_model/loss.tsv')
plotter.plot_weights('./falir_model/weights.txt')

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

# Load the trained model from './falir_model/', the directory passed to
# trainer.train(), rather than from the notebook's working directory.
classifier = TextClassifier.load('./falir_model/best-model.pt')

# Small hand-written example used to sanity-check the classifier.
sentence = Sentence('the love and hate country')


In [None]:
# predict() attaches its result to `sentence`; show the first attached label.
classifier.predict(sentence)
first_label = sentence.labels[0]
first_label

In [None]:
# (predicted label, confidence) read from the Label's own attributes instead
# of being parsed out of its printed "value (score)" form.
top_label = sentence.labels[0]
(top_label.value, top_label.score)

In [None]:
# Re-read the raw test tweets (same file as loaded at the top of the notebook).
test = pd.read_csv('./data/test_tweets_anuFYb8.csv')

In [None]:
# Preview the first rows of the raw test file.
test.head()

In [None]:
from tqdm import tqdm

def get_pred(row):
    """Classify one row's cleaned tweet and store the result in ``row['label']``.

    Uses the module-level ``classifier`` and ``Sentence``.

    Parameters
    ----------
    row : mapping-like
        A row as passed by ``DataFrame.apply(..., axis=1)``; the text is read
        from ``row['tidy_tweet']``.

    Returns
    -------
    The same ``row`` object, with ``row['label']`` always set to an integer
    class: 0 only when class 0 is predicted with a score above 0.5,
    otherwise 1.
    """
    sentence = Sentence(row['tidy_tweet'])
    classifier.predict(sentence)

    if not sentence.labels:
        # Nothing was attached by predict() (presumably for empty text — TODO
        # confirm). Fall back to class 0, the majority class in the training
        # label counts, instead of raising IndexError on labels[0].
        row['label'] = 0
        return row

    top_label = sentence.labels[0]
    label = int(float(top_label.value))  # labels are stored as "0.0" / "1.0"

    if label == 0:
        row['label'] = 0 if top_label.score > 0.5 else 1
    else:
        # Previously this branch left row['label'] untouched, so rows
        # predicted as class 1 kept their missing label.
        row['label'] = label

    return row
    


In [None]:
# Predict a label for every test row. The result is bound to `pred_df`, the
# name all following cells use.
pred_df = df_test.apply(get_pred, axis=1)
df = pred_df  # previous name kept as an alias

In [None]:
# Convert every predicted label to a plain int.
pred_df['label'] = pred_df['label'].apply(int)

In [None]:
# Preview the first predictions.
pred_df.head()

In [None]:
# Display the prediction frame itself (not just its head).
pred_df

In [None]:
# Remove the raw tweet text before export. `columns=` replaces the positional
# axis argument, which newer pandas rejects.
pred_df.drop(columns=['tweet'], inplace=True)

In [None]:
# Write the predictions to the submission file, without the index column.
pred_df.to_csv('flair_sub.csv', index=False)

In [None]:
# Distribution of the predicted labels.
pred_df['label'].value_counts()