In [1]:
import pandas as pd
from tqdm import tqdm
import re
import numpy as np
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [2]:
train=pd.read_csv('./data/train_E6oV3lV.csv')
test=pd.read_csv('./data/test_tweets_anuFYb8.csv')

In [3]:
train.head()
train=train.sample(frac=1)

In [4]:
combi = train.append(test, ignore_index=True,sort=False)

In [5]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt   

In [6]:
# remove twitter handles (@user)
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], "@[\w]*")

# remove special characters, numbers, punctuations
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")

combi['tidy_tweet'] = combi['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

combi['tidy_tweet'] = combi['tidy_tweet'].str.replace('#',' Hashtag ')

tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split())
tokenized_tweet.head()

from nltk.stem.porter import *
from nltk.stem.wordnet import WordNetLemmatizer
stemmer = PorterStemmer()
lmtzr = WordNetLemmatizer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet = tokenized_tweet.apply(lambda x: [lmtzr.lemmatize(i) for i in x]) #lemmatize 
tokenized_tweet.head()

for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

combi['tidy_tweet'] = tokenized_tweet

In [7]:
df_train=combi[:len(train)]
df_test=combi[len(train):]

In [8]:
df_train.label.value_counts()

0.0    29720
1.0     2242
Name: label, dtype: int64

In [9]:
df_train.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,27377,0.0,if i knew before that get some suppo of very d...,knew befor that get some suppo veri difficult ...
1,3493,0.0,hello abitur ðð¼ðð #me #selfie #a...,hello abitur hashtag me hashtag selfi hashtag ...
2,21316,0.0,@user @user i have twice they've read my email...,have twice they read email but not yet answer
3,5464,0.0,blessings to those massacred or injured in the...,bless those massacr injur the hashtag pulsesho...
4,29318,0.0,b e h a p p y ã°ã°ã°ã°ã°ã°ã°ã°ã...,mscandem zero worri god gat hashtag regrann


In [10]:
from flair.models import TextClassifier
from flair.data import Sentence
classifier = TextClassifier.load('en-sentiment')

2019-10-17 11:40:19,630 loading file /Users/subir/.flair/models/imdb-v0.4.pt


In [15]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings , BertEmbeddings ,CharacterEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

In [16]:
train = df_train[['label', 'tidy_tweet']].rename(columns={"v1":"label", "v2":"text"})
train['label'] = '__label__' + train['label'].astype(str)
train.iloc[0:int(len(train)*0.8)].to_csv('train.csv', sep='\t', index = False, header = False)
train.iloc[int(len(train)*0.8):int(len(train)*0.9)].to_csv('test.csv', sep='\t', index = False, header = False)
train.iloc[int(len(train)*0.9):].to_csv('dev.csv', sep='\t', index = False, header = False);

In [27]:
corpus = NLPTaskDataFetcher.load_classification_corpus(Path('.'),train_file='train.csv',test_file='test.csv',dev_file='dev.csv')
word_embeddings = [WordEmbeddings('glove'), FlairEmbeddings('en-forward'), FlairEmbeddings('en-backward'),CharacterEmbeddings()]
document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=256, reproject_words=True, reproject_words_dimension=64)
classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)


2019-10-17 11:56:20,216 Reading data from .
2019-10-17 11:56:20,229 Train: train.csv
2019-10-17 11:56:20,231 Dev: dev.csv
2019-10-17 11:56:20,232 Test: test.csv


  """Entry point for launching an IPython kernel.
  max_tokens_per_doc=max_tokens_per_doc,
  max_tokens_per_doc=max_tokens_per_doc,
  max_tokens_per_doc=max_tokens_per_doc,


2019-10-17 11:56:28,516 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt not found in cache, downloading to /var/folders/5m/h_g_92_s1d11s4pd5cbhs48m0000gn/T/tmp495oquh2


100%|██████████| 73034624/73034624 [01:10<00:00, 1034338.30B/s]

2019-10-17 11:57:39,924 copying /var/folders/5m/h_g_92_s1d11s4pd5cbhs48m0000gn/T/tmp495oquh2 to cache at /Users/subir/.flair/embeddings/news-forward-0.4.1.pt





2019-10-17 11:57:40,080 removing temp file /var/folders/5m/h_g_92_s1d11s4pd5cbhs48m0000gn/T/tmp495oquh2
2019-10-17 11:57:41,103 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt not found in cache, downloading to /var/folders/5m/h_g_92_s1d11s4pd5cbhs48m0000gn/T/tmppr8iwpjb


100%|██████████| 73034575/73034575 [01:03<00:00, 1150621.45B/s]

2019-10-17 11:58:45,498 copying /var/folders/5m/h_g_92_s1d11s4pd5cbhs48m0000gn/T/tmppr8iwpjb to cache at /Users/subir/.flair/embeddings/news-backward-0.4.1.pt





2019-10-17 11:58:45,625 removing temp file /var/folders/5m/h_g_92_s1d11s4pd5cbhs48m0000gn/T/tmppr8iwpjb
2019-10-17 11:58:45,815 Computing label dictionary. Progress:


  This is separate from the ipykernel package so we can avoid doing imports until
100%|██████████| 25562/25562 [00:00<00:00, 282472.02it/s]

2019-10-17 11:58:45,908 [b'0.0', b'1.0']





In [None]:
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=5,checkpoint=True,monitor_train=True,mini_batch_size=8)

2019-10-17 12:02:44,232 ----------------------------------------------------------------------------------------------------
2019-10-17 12:02:44,234 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
      (list_embedding_3): CharacterEmbeddings(
        (char_embedding): Embedding(275, 25)
        (char_rnn): LST

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-10-17 18:47:45,832 ----------------------------------------------------------------------------------------------------
2019-10-17 18:47:46,769 epoch 2 - iter 0/3196 - loss 0.42494351 - samples/sec: 2862.68
2019-10-17 18:49:49,224 epoch 2 - iter 319/3196 - loss 0.18753866 - samples/sec: 20.97
2019-10-17 18:51:49,250 epoch 2 - iter 638/3196 - loss 0.18298670 - samples/sec: 21.34
2019-10-17 18:53:44,558 epoch 2 - iter 957/3196 - loss 0.17935477 - samples/sec: 22.20
2019-10-17 18:55:38,765 epoch 2 - iter 1276/3196 - loss 0.18000432 - samples/sec: 22.40
2019-10-17 18:57:39,760 epoch 2 - iter 1595/3196 - loss 0.17773952 - samples/sec: 21.15
2019-10-17 18:59:53,335 epoch 2 - iter 1914/3196 - loss 0.17436145 - samples/sec: 19.16


In [None]:
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('loss.tsv')
plotter.plot_weights('weights.txt')

In [2]:
from flair.models import TextClassifier
from flair.data import Sentence
classifier = TextClassifier.load('./best-model.pt')
sentence = Sentence('the love and hate country')


2019-10-19 13:47:46,675 loading file ./best-model.pt


  result = unpickler.load()


In [43]:
classifier.predict(sentence)
sentence.labels[0]

0.0 (0.35794925689697266)

In [50]:
((str(sentence.labels[0]).split('(')[0]).strip()), float(str(sentence.labels[0]).split('(')[1].split(')')[0])

('0.0', 0.35794925689697266)

In [None]:
test=pd.read_csv('./data/test_tweets_anuFYb8.csv')

In [None]:
test.head()

In [54]:
from tqdm import tqdm

def get_pred(row):
    sentence = Sentence(row['tidy_tweet'])
    classifier.predict(sentence)
    label= int(float((str(sentence.labels[0]).split('(')[0])))
    if label==0:
        if float(str(sentence.labels[0]).split('(')[1].split(')')[0])>0.5:
            row['label']=0
        else:
            row['label']=1
        
    return row
    


In [56]:
df = df_test.apply(get_pred,1)

UnboundLocalError: ("local variable 'index' referenced before assignment", 'occurred at index 38777')

In [None]:
pred_df.label=pred_df.label.apply(lambda x : int(x))

In [None]:
pred_df.head()

In [None]:
pred_df

In [None]:
pred_df.drop(['tweet'],1,inplace=True)

In [None]:
pred_df.to_csv('flair_sub.csv',index=False)

In [None]:
pred_df.label.value_counts()