In [33]:
import pandas as pd
import numpy as np
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from fuzzywuzzy import fuzz
import spacy
import random
from spacy.util import minibatch, compounding

In [34]:
# Load the training tweets and encode the sentiment label as an integer
# (negative -> -1, neutral -> 0, positive -> 1).
raw_data = pd.read_csv('train.csv')
# Assign the result back instead of calling an in-place method on the column
# selection: with pandas copy-on-write the chained in-place call operates on
# a temporary object and leaves `raw_data` unchanged.
raw_data['sentiment'] = raw_data['sentiment'].replace({'neutral':0,'negative':-1,'positive':1})
raw_data


Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,-1
2,088c60f138,my boss is bullying me...,bullying me,-1
3,9642c003ef,what interview! leave me alone,leave me alone,-1
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",-1
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,-1
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",-1
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,1
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,1


In [35]:
# Matches every character that pre_processing removes, i.e. anything other
# than digits, lowercase ASCII letters, '#', '+', space, '_' and the backtick.
# Compiled once at definition time (not on every call) and written as a raw
# string so the pattern contains no invalid string escape.
_DISALLOWED_CHARS_RE = re.compile(r'[^0-9a-z#+ _`]')


def pre_processing(x):
    """Normalise a piece of tweet text.

    Steps, in order:
      1. Convert ``x`` to ``str`` and lower-case it.
      2. Split on single spaces and drop every token that contains ``http``
         or is exactly one character long.
      3. Re-join with single spaces and delete every character that is not a
         digit, a lowercase ASCII letter, ``#``, ``+``, space, ``_`` or a
         backtick.

    The one-character filter runs before punctuation is deleted, so a token
    such as ``"u,"`` survives and becomes ``"u"``.

    Parameters
    ----------
    x : object
        Value to clean; non-strings (e.g. NaN, None) are stringified first.

    Returns
    -------
    str
        The cleaned text.
    """
    tokens = str(x).lower().split(' ')
    kept = [tok for tok in tokens if 'http' not in tok and len(tok) != 1]
    return _DISALLOWED_CHARS_RE.sub('', ' '.join(kept))
    

In [36]:
# Clean the selected span of every tweet with pre_processing and keep the
# result in its own column, then show that column.
raw_data['cleaned_text'] = raw_data['selected_text'].apply(pre_processing)
raw_data['cleaned_text']

0                         i`d have responded if were going
1                                                 sooo sad
2                                              bullying me
3                                           leave me alone
4                                                 sons of 
                               ...                        
27476                                                 lost
27477                                          don`t force
27478                             yay good for both of you
27479                                but it was worth it  
27480    all this flirting going on the atg smiles yay ...
Name: cleaned_text, Length: 27481, dtype: object

In [37]:
def get_token_ration(x1, x2):
    """Return ``fuzz.token_set_ratio`` computed on the string forms of x1 and x2."""
    first, second = str(x1), str(x2)
    return fuzz.token_set_ratio(first, second)

In [38]:
# Score each cleaned selected span against its full tweet text.
# The two columns are walked in lockstep with zip(): the previous
# iterrows() loop read the values with positional `row[0]` / `row[1]` on a
# label-indexed Series, which pandas has deprecated.
ratio_set = [
    get_token_ration(cleaned, text)
    for cleaned, text in zip(raw_data['cleaned_text'], raw_data['text'])
]

In [39]:
# Store the per-row fuzzy-match scores computed above as a new column.
raw_data['token_ration'] = ratio_set

In [40]:
# Lower-case the full tweet text; each value is stringified first.
raw_data['text'] = raw_data['text'].map(lambda tweet: str(tweet).lower())

In [41]:
# Preview the columns involved in the span-matching step.
preview_columns = ['textID','text','cleaned_text','token_ration','sentiment']
raw_data[preview_columns]

Unnamed: 0,textID,text,cleaned_text,token_ration,sentiment
0,cb774db0d1,"i`d have responded, if i were going",i`d have responded if were going,100,0
1,549e992a42,sooo sad i will miss you here in san diego!!!,sooo sad,100,-1
2,088c60f138,my boss is bullying me...,bullying me,100,-1
3,9642c003ef,what interview! leave me alone,leave me alone,100,-1
4,358bd9e861,"sons of ****, why couldn`t they put them on t...",sons of,100,-1
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on denver husband l...,lost,100,-1
27477,4f4c4fc327,i`ve wondered about rake to. the client has ...,don`t force,100,-1
27478,f67aae2310,yay good for both of you. enjoy the break - y...,yay good for both of you,100,1
27479,ed167662a5,but it was worth it ****.,but it was worth it,100,1


In [42]:
# Discard, in place, every row whose token-set ratio is below 80.
low_ratio_index = raw_data.index[raw_data['token_ration'] < 80]
raw_data.drop(index=low_ratio_index, inplace=True)

In [43]:
# Delete every "one non-word character followed by word characters" chunk,
# then any remaining '+' signs. Patterns are raw strings so that \W, \w and
# \+ reach the regex engine without relying on invalid string escapes.
# NOTE(review): the first substitution removes " word" chunks too, so a
# multi-word span such as "leave me alone" is reduced to "leave" — confirm
# that keeping only the leading word is intended.
raw_data['cleaned_text'] = raw_data['cleaned_text'].apply(lambda x : re.sub(r'\W\w+','',x))
raw_data['cleaned_text'] = raw_data['cleaned_text'].apply(lambda x : re.sub(r'\+','',x))

In [44]:
# Accumulates training examples of the form
# (text, {'entities': [(start, end, 'twitter')]}); filled by prepare_datset.
dataset_ = []
def prepare_datset(full_sen, token_sen):
    """Append one training example to ``dataset_`` per token found in ``full_sen``.

    ``token_sen`` is split on single spaces; empty tokens are skipped. For
    every remaining token the first literal occurrence in ``full_sen`` is
    located and, if present, ``(full_sen, {'entities': [(start, end,
    'twitter')]})`` is appended to the module-level ``dataset_`` list.
    Tokens that do not occur in ``full_sen`` are ignored.

    Parameters
    ----------
    full_sen : str
        The text in which tokens are searched.
    token_sen : str
        Space-separated tokens to locate.

    Returns
    -------
    None
        Results are collected in ``dataset_``.
    """
    for token in token_sen.split(' '):
        if not token:
            continue
        # Escape the token so it is matched literally: characters such as
        # '+', '(' or '*' would otherwise be read as regex syntax and either
        # raise re.error or match the wrong span.
        found = re.search(re.escape(token), full_sen)
        if found is None:
            continue
        start, end = found.span()
        dataset_.append((full_sen, {'entities': [(start, end, 'twitter')]}))


In [45]:
# Build the training examples from every remaining row.
# The columns are zipped instead of using iterrows() with positional
# `row[0]` / `row[1]` lookups, which pandas has deprecated for
# label-indexed Series.
for text, cleaned in zip(raw_data['text'], raw_data['cleaned_text']):
    prepare_datset(text, cleaned)


In [52]:
# Number of (text, annotations) training examples collected in dataset_.
len(dataset_)

26575

In [53]:
def main(model=None, output_dir=None, n_iter=100,TRAIN_DATA=None):
    """Train a spaCy NER component on ``TRAIN_DATA`` and optionally save it.

    NOTE(review): ``create_pipe`` / ``add_pipe(component)`` /
    ``nlp.update(texts, annotations)`` look like the spaCy v2 training API —
    confirm the installed spaCy version before running.

    Parameters
    ----------
    model : str or None
        Passed to ``spacy.load``. When None a blank ``"en"`` pipeline is
        created and ``nlp.begin_training()`` is called before the loop.
    output_dir : path-like or None
        Directory handed to ``nlp.to_disk`` after training. When None the
        pipeline is not written to disk.
    n_iter : int
        Number of passes over the training data.
    TRAIN_DATA : iterable of (text, annotations)
        ``annotations`` is a dict whose ``'entities'`` entry holds
        ``(start, end, label)`` tuples. The examples are copied into a
        private list, so the caller's object is never reordered.

    Raises
    ------
    TypeError
        If ``TRAIN_DATA`` is not iterable (e.g. left at its None default).
    """
    # Work on a private copy: random.shuffle() below reorders in place and
    # must not scramble the list owned by the caller. Doing this first also
    # makes a missing TRAIN_DATA fail before any model is loaded.
    train_data = list(TRAIN_DATA)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    print('Checking NER pipe in model')
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
    print('Adding Entries')
    for _, annotations in train_data:
        # Default to an empty list so an example without an 'entities' key
        # does not break label registration.
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly - but only if we're
        # training a new model
        print('Training Begin')
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            print('Iteration {}'.format(itn))
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
    # Only save when a destination was given; calling to_disk(None) would
    # fail after the whole training run has already been paid for.
    if output_dir is not None:
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
    

In [54]:
# Train a blank model for 100 iterations on the collected examples and save
# it to the directory below.
MODEL_OUTPUT_DIR = r'C:\Users\HP\AI Projects\Learn\Kaggle\twitter_analysis'
main(model=None, output_dir=MODEL_OUTPUT_DIR, n_iter=100, TRAIN_DATA=dataset_)

Created blank 'en' model
Checking NER pipe in model
Adding Entries
Training Begin
Iteration 0
Losses {'ner': 27422.998466562327}
Iteration 1
Losses {'ner': 25013.22993447356}
Iteration 2
Losses {'ner': 23813.280579958824}
Iteration 3


KeyboardInterrupt: 