# 1.Importing Libraries

In [1]:
import re
import string

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

import nltk
from nltk.corpus import stopwords

from tqdm import tqdm
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import os
import random

import spacy
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")



In [2]:
import os
for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tweet-sentiment-extraction/sample_submission.csv
/kaggle/input/tweet-sentiment-extraction/train.csv
/kaggle/input/tweet-sentiment-extraction/test.csv


In [None]:
def random_colors(no_of_colors):
    '''
    Simple function for random color generation.

    Input:
        no_of_colors - how many colour strings to generate.
    Output:
        list of `no_of_colors` strings of the form "#XXXXXX", where every X is
        a hex digit picked with `random.choice` (so the result follows the
        state of the global `random` generator).
    '''
    # Full hex alphabet. The previous alphabet left out 7, 8 and 9, so a large
    # share of the colour space could never be produced.
    hex_digits = "0123456789ABCDEF"

    return [
        "#" + "".join([random.choice(hex_digits) for _ in range(6)])
        for _ in range(no_of_colors)
    ]

In [None]:
train = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")
test = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")
ss = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/sample_submission.csv")

In [None]:
train.head()

In [None]:
print(train.shape)
print(test.shape)

In [None]:
train.info()

In [None]:
train.dropna(inplace=True)

In [None]:
test.info()

# 2.EDA

In [None]:
train.describe()

In [None]:
temp = train.groupby("sentiment")["text"].count().reset_index().sort_values(by="text",ascending=False)
temp.style.background_gradient(cmap="Purples")

In [None]:
plt.figure(figsize=(12, 6))
sns.countplot(x="sentiment", data=train);

In [None]:
fig = go.Figure(go.Funnelarea(
        text = temp.sentiment,
        values = temp.text,
        title = {"position": "top center", "text": "Funnel-Chart Of Sentiment Distribution"}

))
fig.show();

# 3. Generating additional features

In [None]:
def jaccard(str1, str2):
    '''
    Outputs the jaccard similarity score between two texts.

    Both texts are lower-cased and split on whitespace; the score is
    |intersection| / |union| of the two resulting word sets.

    Input:
        str1, str2 - the two strings to compare.
    Output:
        float in [0, 1]. When neither text contains a word the union is empty;
        the two (empty) word sets are identical, so 1.0 is returned instead of
        raising ZeroDivisionError.
    '''
    a = set(str1.lower().split())
    b = set(str2.lower().split())

    union_size = len(a | b)
    if union_size == 0:
        # both texts are empty / whitespace only
        return 1.0

    return len(a & b) / union_size

In [None]:
res_jaccard = []

In [None]:
for ind, row in train.iterrows():
    sentence1 = row.text
    sentence2 = row.selected_text
    
    jaccard_score = jaccard(sentence1, sentence2)
    res_jaccard.append([sentence1, sentence2, jaccard_score])

In [None]:
len(res_jaccard)

In [None]:
res_jaccard[0]

In [None]:
jaccard = pd.DataFrame(res_jaccard, columns=["text", "selected_text", "jaccard_score"])
jaccard.head()

In [None]:
train = train.merge(jaccard, how="outer")
train.head()

In [None]:
train.loc[train["jaccard_score"] == 1]

In [None]:
train["num_words_st"] = train["selected_text"].apply(lambda x: len(str(x).split()))
train["num_words_mt"] = train["text"].apply(lambda x: len(str(x).split()))
train["diff_in_words"] = train["num_words_mt"] - train["num_words_st"]

In [None]:
train.head()

In [None]:
hist_data = [train["num_words_st"], train["num_words_mt"]]

group_labels = ["Selected Text", "Text"]

fig = ff.create_distplot(hist_data, group_labels, show_curve=False)
fig.update_layout(title_text="Distribution of no:of words")
fig.update_layout(
    autosize=False,
    width=900,
    height=700,
    paper_bgcolor="LightSteelBlue"
)
fig.show();

In [None]:
plt.figure(figsize=(12, 6))
viz = sns.kdeplot(train["num_words_st"], shade=True, color="r").set_title("Kernel Distribution Of No: of words")
viz = sns.kdeplot(train["num_words_mt"], shade=True, color="b")

In [None]:
train.head()

In [None]:
plt.figure(figsize=(12, 6))
viz1 = sns.kdeplot(train.loc[train["sentiment"] == "positive"]["diff_in_words"], shade=True, color="b").set_title("Kernel Distribution Of Diff In No: of words")
viz1 = sns.kdeplot(train.loc[train["sentiment"] == "negative"]["diff_in_words"], shade=True, color="r")

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(train.loc[train["sentiment"] == "neutral"]["diff_in_words"], kde=False);

In [None]:
train.loc[train["sentiment"] == "neutral"]

In [None]:
plt.figure(figsize=(12, 6))
sns.kdeplot(train.loc[train["sentiment"] == "positive"]["jaccard_score"], shade=True, color="b").set_title("KDE Of Jaccard Scores across diff sentiments")
sns.kdeplot(train.loc[train["sentiment"] == "negative"]["jaccard_score"], shade=True, color="r")
plt.show();

In [None]:
plt.figure(figsize=(12, 6))
sns.distplot(train.loc[train["sentiment"] == "neutral"]["jaccard_score"], kde=False);

# 3.Deep dive into tweets where there is high similarity between text and selected_text

In [None]:
k = train.loc[train["num_words_mt"] <= 2]
k.head()

In [None]:
k.sentiment.value_counts(normalize=True)*100

In [None]:
k.groupby("sentiment")["jaccard_score"].mean()*100

In [None]:
k.loc[k["sentiment"] == "positive"].sort_values(by="jaccard_score", ascending=False)

# 4.Data Cleaning

In [None]:
def clean_text(text):
    '''
    Make text lowercase, remove text in square brackets, remove links,
    remove HTML-style tags, remove punctuation, remove newlines
    and remove words containing numbers.

    Input:
        text - value to clean; it is converted with str() first, so
               non-string values are cleaned as their string form.
    Output:
        the cleaned string. Removed pieces are deleted, not replaced by a
        space, so surrounding whitespace is left as it was and words that
        were separated only by a newline end up joined.
    '''
    # Patterns are raw strings so that \[ \S \w \d reach the regex engine
    # unchanged instead of relying on Python tolerating invalid string escapes.
    text = str(text).lower()
    text = re.sub(r"\[.*?\]", "", text)                  # [bracketed] text
    text = re.sub(r"https?://\S+|www\.\S+", "", text)    # links
    text = re.sub(r"<.*?>+", "", text)                   # <tags>
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\n", "", text)                       # newlines
    text = re.sub(r"\w*\d\w*", "", text)                 # words containing digits

    return text

In [None]:
train.head()

In [None]:
train["text"] = train["text"].apply(lambda x: clean_text(x))
train["selected_text"] = train["selected_text"].apply(lambda x: clean_text(x))

In [None]:
train.head()

# 5.EDA On Most Common Words In The Text, Selected Text 

In [None]:
# finding out the most common words in the selected text(target) column
train["temp_list"] = train["selected_text"].apply(lambda x: str(x).split())
top = Counter([item for sublist in train["temp_list"] for item in sublist])
temp = pd.DataFrame(top.most_common(20), columns=["Word", "Count"])
temp.style.background_gradient(cmap="Blues")

In [None]:
fig = px.bar(temp, 
       x="Count", 
       y="Word", 
       title="Common Words In Selected Text", 
       orientation="h", 
       width=700,
       height=700,
       color="Word")
fig.show();

In [None]:
# English stop words, loaded on first use and reused by every later call.
_ENGLISH_STOP_WORDS = None


def remove_stop_words(word):
    '''
    Drop English stop words from a list of tokens.

    Input:
        word - iterable of tokens (despite the singular name).
    Output:
        list of the tokens that are not in nltk's English stop word list,
        in their original order.
    '''
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Loading the list once (instead of once per token) and storing it as
        # a set makes each membership test cheap.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

    return [y for y in word if y not in _ENGLISH_STOP_WORDS]

In [None]:
train.head()

In [None]:
train["temp_list"] = train["temp_list"].apply(lambda word: remove_stop_words(word))

In [None]:
train.head()

In [None]:
# finding out the most common words in the selected text(target) column
top = Counter([item for sublist in train["temp_list"] for item in sublist])
temp = pd.DataFrame(top.most_common(20), columns=["Word", "Count"])
temp = temp.iloc[1:,:]
temp.style.background_gradient(cmap="Purples")

In [None]:
fig = px.treemap(temp, path=["Word"], values="Count", title="Tree of most common words in selected text")
fig.show();

In [None]:
train.head()

In [None]:
train.rename({"temp_list": "temp_list_st"}, axis=1, inplace=True)

In [None]:
train.head()

In [None]:
train["temp_list_mt"] = train["text"].apply(lambda x: str(x).split())
train["temp_list_mt"] = train["temp_list_mt"].apply(lambda x: remove_stop_words(x))
top = Counter([item for sublist in train["temp_list_mt"] for item in sublist])
temp1 = pd.DataFrame(top.most_common(20), columns=["Word", "Count"])
temp1.style.background_gradient(cmap="Blues")

In [None]:
temp1 = temp1.iloc[1:,:]
temp1.style.background_gradient(cmap="Blues")

In [None]:
fig = px.bar(temp1, 
       x="Count", 
       y="Word", 
       title="Common Words In Text", 
       orientation="h", 
       width=700,
       height=700,
       color="Word")
fig.show();

In [None]:
fig = px.treemap(temp1, path=["Word"], values="Count", title="Tree of most common words in text")
fig.show();

# 6.EDA On Most Common Words Sentiment-wise

In [None]:
pos_text = train.loc[train["sentiment"] == "positive"]
neg_text = train.loc[train["sentiment"] == "negative"]
neutral_text = train.loc[train["sentiment"] == "neutral"]

In [None]:
neutral_text.head()

In [None]:
# Most common positive words
top = Counter([item for sublist in pos_text["temp_list_st"] for item in sublist])
temp_pos = pd.DataFrame(top.most_common(20), columns=["Word", "Count"])
temp_pos = temp_pos.iloc[:,:]
temp_pos.style.background_gradient(cmap="Greens")

In [None]:
fig = px.bar(temp_pos, 
       x="Count", 
       y="Word", 
       title="Common Words In Positive Selected Text", 
       orientation="h", 
       width=700,
       height=700,
       color="Word")
fig.show();

In [None]:
# Most common negative words
top = Counter([item for sublist in neg_text["temp_list_st"] for item in sublist])
temp_neg = pd.DataFrame(top.most_common(20), columns=["Word", "Count"])
temp_neg = temp_neg.iloc[1:,:]
temp_neg.style.background_gradient(cmap="Reds")

In [None]:
fig = px.bar(temp_neg, 
       x="Count", 
       y="Word", 
       title="Common Words In Negative Selected Text", 
       orientation="h", 
       width=700,
       height=700,
       color="Word")
fig.show();

In [None]:
# Most common neutral words
top = Counter([item for sublist in neutral_text["temp_list_st"] for item in sublist])
temp_neu = pd.DataFrame(top.most_common(20), columns=["Word", "Count"])
temp_neu = temp_neu.iloc[1:,:]
temp_neu.style.background_gradient(cmap="Blues")

In [None]:
fig = px.bar(temp_neu, 
       x="Count", 
       y="Word", 
       title="Common Words In Neutral Selected Text", 
       orientation="h", 
       width=700,
       height=700,
       color="Word")
fig.show();

# 7.Deep dive into unique words in each sentiment

In [None]:
raw_text = [word for word_list in train["temp_list_mt"] for word in word_list]

In [None]:
len(raw_text)

In [None]:
raw_text[1]

In [None]:
l1 = raw_text[:7]

In [None]:
l1

In [None]:
count = Counter()

In [None]:
for word in l1:
    count[word] += 1

In [None]:
count

In [None]:
count.most_common(2)

In [None]:
def words_unique(sentiment, numwords, raw_words):
    '''
    Find the most frequent words that appear only in tweets of one sentiment.

    Input:
        sentiment - sentiment label to analyse (ex. 'positive');
        numwords - how many words to return at most;
        raw_words - list of candidate words; a word must be in this list to be
                    eligible for the result.
    Output:
        dataframe with columns "words" and "count": the words that occur in
        tweets of `sentiment`, never in tweets of any other sentiment, and in
        `raw_words`, with how often they occur in tweets of `sentiment`
        (in descending order based on their counts).

    Reads the global `train` dataframe (columns "sentiment" and "temp_list_mt").
    '''
    # every word used by at least one tweet of a different sentiment
    other_words = {
        word
        for tokens in train[train["sentiment"] != sentiment]["temp_list_mt"]
        for word in tokens
    }

    # candidates that never show up outside the requested sentiment;
    # sets keep the membership tests below cheap
    keep = set(raw_words) - other_words

    mycounter = Counter(
        word
        for tokens in train[train["sentiment"] == sentiment]["temp_list_mt"]
        for word in tokens
        if word in keep
    )

    unique_words = pd.DataFrame(mycounter.most_common(numwords), columns=["words", "count"])

    return unique_words

In [None]:
unique_positive = words_unique("positive", 20, raw_text)
unique_positive.style.background_gradient(cmap="Greens")

In [None]:
pip install palettable

In [None]:
import palettable
from palettable.colorbrewer.qualitative import Pastel1_7
plt.figure(figsize=(16, 10))
my_circle = plt.Circle((0, 0), 0.7, color="white")
plt.pie(unique_positive["count"], labels=unique_positive["words"], colors=Pastel1_7.hex_colors)
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.title("Donut Plot Of Unique Positive Words");

In [None]:
unique_negative = words_unique("negative", 20, raw_text)
unique_negative.style.background_gradient(cmap="Greens")

In [None]:
unique_neutral = words_unique("neutral", 20, raw_text)
unique_neutral.style.background_gradient(cmap="Blues")

# Building WordCloud

In [None]:
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, fig_size=(24.0, 16.0), color="white", title=None, title_size=40, image_color=False):
    '''
    Draw a word cloud for the given text on a new matplotlib figure.

    Input:
        text - a string, or an iterable of strings (e.g. a dataframe column)
               whose items are joined with spaces before the cloud is built;
        mask - optional image array passed to WordCloud as `mask`;
        max_words, max_font_size - passed through to WordCloud;
        fig_size - size of the matplotlib figure;
        color - background colour of the cloud;
        title, title_size - figure title and its font size;
        image_color - if True, recolour the words with
                      ImageColorGenerator(mask) (this needs a mask).
    Output:
        None; the cloud is drawn on the current figure.
    '''
    # local name chosen so the nltk `stopwords` import is not shadowed
    stop_words = set(STOPWORDS).union({"u", "im"})

    wordcloud = WordCloud(background_color=color,
                          stopwords=stop_words,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,
                          width=400,
                          height=200,
                          mask=mask)

    # str() of a pandas Series is its truncated printed form (index numbers,
    # "...", "Name: ..., dtype: ..."), so join the items instead.
    if isinstance(text, str):
        corpus = text
    else:
        try:
            corpus = " ".join(str(item) for item in text)
        except TypeError:
            # not iterable: fall back to its string form
            corpus = str(text)
    wordcloud.generate(corpus)

    # the keyword is `figsize`; `fig_size` is rejected by matplotlib
    plt.figure(figsize=fig_size)

    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
        plt.title(title, fontdict={"size": title_size, "verticalalignment": "bottom"})

    else:
        plt.imshow(wordcloud)
        plt.title(title, fontdict={"size": title_size, "color": "black", "verticalalignment": "bottom"})

    plt.axis("off")
    plt.tight_layout()
d = "/kaggle/input/masks-for-wordclouds/"        

In [None]:
neutral_text.columns

In [None]:
neu_mask = np.array(Image.open(d+"twitter_mask.png"))
plot_wordcloud(neutral_text.text, mask=neu_mask, color="white", title="Wordcloud Of Neutral Tweets");

# Modelling

- Modelling as a NER (Named Entity Recognition) problem
  - For neutral tweets and for tweets with fewer than 3 words, we use the full text as the selected text, because the jaccard score between the two is high (close to 1)
  - Building separate models for both positive and negative sentiments

In [None]:
df_train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
df_test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
df_submission = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')

In [None]:
df_train.head()

In [None]:
df_train.shape

In [None]:
df_train = df_train.dropna()

In [None]:
df_train["num_words_text"] = df_train["text"].apply(lambda x: len(str(x).split()))

In [None]:
df_train.head()

In [None]:
df_train = df_train.loc[df_train["num_words_text"]>=3]

In [None]:
def save_model(output_dir, nlp, new_model_name):
    '''
    This func saves the model to the given output directory.

    Input:
        output_dir - directory, relative to "../working/", to save into;
                     if None, nothing is saved;
        nlp - the model to save; its meta["name"] is set to `new_model_name`
              and it is written with nlp.to_disk();
        new_model_name - name stored in the model's meta data.
    Output:
        None; prints the directory the model was saved to.
    '''
    # Test for None before building the path: the formatted string is never
    # None, so checking afterwards would save into "../working/None".
    if output_dir is None:
        return

    output_dir = f"../working/{output_dir}"
    os.makedirs(output_dir, exist_ok=True)

    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    print(f"Saved model to {output_dir}")

In [None]:
from spacy.training.example import Example
from spacy.pipeline.ner import DEFAULT_NER_MODEL

In [None]:
def train_model(train_data, output_dir, n_iter=20, model=None):
    '''
    This function:
        1. Load the model
        2. Setup the pipeline
        3. Train the entity recogniser
        4. Save the result with save_model() under the name "st_ner"

    Input:
        train_data - list of (text, {"entities": [(start, end, label), ...]})
                     tuples; the list itself is not modified;
        output_dir - directory handed to save_model();
        n_iter - number of passes over the training data;
        model - name or path of an existing spaCy model to continue training,
                or None to start from a blank English model.
    Output:
        None; prints the losses after every pass.
    '''

    if model is not None:
        nlp = spacy.load(model) # loads the requested existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en") # creates blank language class
        print("Created blank 'en' model")

    # Use the component that add_pipe() puts into the pipeline. A component
    # built separately with create_pipe() is a different object, so labels
    # added to it would never reach the pipeline that gets trained.
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    # otherwise, fetch so that we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # adding labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # shuffle a copy so the caller's list keeps its order
    samples = list(train_data)

    with nlp.select_pipes(enable="ner"): # only "ner" is active during training

        if model is None:
            optimizer = nlp.initialize()
        else:
            optimizer = nlp.resume_training()

        # batch up the examples using spacy minibatch
        for itn in tqdm(range(n_iter)):
            random.shuffle(samples)
            batches = minibatch(samples, size=compounding(4.00, 500.0, 1.001))
            losses = {}
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples,
                           drop=0.5, # dropout - makes it harder to memorize data
                           sgd=optimizer,
                           losses=losses)
            print("Losses", losses)

    save_model(output_dir, nlp, "st_ner")

In [None]:
def get_model_out_path(sentiment):
    '''
    Gives the relative directory in which the model for a sentiment is kept:
    "positive" -> "models/model_pos", "negative" -> "models/model_neg",
    any other value -> "models/model_neu".
    '''
    if sentiment == "positive":
        return "models/model_pos"

    if sentiment == "negative":
        return "models/model_neg"

    # everything else is treated as neutral
    return "models/model_neu"

In [None]:
def get_training_data(sentiment):
    '''
    Returns the training data in the format needed to train the spacy NER model.

    Input:
        sentiment - only rows of the global `df_train` with this sentiment
                    are used.
    Output:
        list of (text, {"entities": [(start, end, 'selected_text')]}) tuples,
        where start/end are the character offsets of the row's selected_text
        inside its text. Rows whose selected_text does not occur in the text
        are left out, because no valid span exists for them.
    '''
    rows = df_train[df_train["sentiment"] == sentiment]

    train_data = []
    for text, selected_text in zip(rows["text"], rows["selected_text"]):
        start = text.find(selected_text)
        if start < 0:
            # find() returns -1 when there is no match; (-1, ...) would be a
            # bogus entity span
            continue
        end = start + len(selected_text)
        train_data.append((text, {"entities": [(start, end, 'selected_text')]}))
    return train_data

In [None]:
df_train.head()

In [None]:
for i, r in df_train.iterrows():
    if r.sentiment == "positive":
        text = r.text
        selected_text = r.selected_text
        start = text.find(selected_text)
        end = start + len(selected_text)

In [None]:
start

In [None]:
end

In [None]:
len(selected_text)

In [None]:
df_train[df_train.text == ' But it was worth it  ****.']

In [None]:
text

In [None]:
selected_text

In [None]:
sentiment = "positive"

train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)

train_model(train_data, model_path, n_iter=3, model=None)

In [None]:
sentiment = "negative"

train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)

train_model(train_data, model_path, n_iter=3, model=None)

In [None]:
sentiment = "neutral"

train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)

train_model(train_data, model_path, n_iter=3, model=None)

## Predicting with the trained models

In [None]:
models_base_path = '../input/tse-spacy-model/models/'
selected_text = []

In [None]:
def predict_entities(text, model):
    '''
    Extract the selected text from a tweet with a trained NER model.

    Input:
        text - the tweet text;
        model - callable that turns `text` into a document exposing the
                recognised entities as `.ents` (each with a `.text`).
    Output:
        the part of `text` covered by the first recognised entity, or the
        whole `text` when the model recognises no entity.
    '''
    doc = model(text)

    for ent in doc.ents:
        start = text.find(ent.text)
        if start >= 0:
            return text[start:start + len(ent.text)]

    # nothing recognised: fall back to the full tweet
    return text

In [None]:
if models_base_path is not None:
    print(f"Loading models from {models_base_path}")
    model_pos = spacy.load(models_base_path + "model_pos")
    model_neg = spacy.load(models_base_path + "model_neg")
    model_neu = spacy.load(models_base_path + "model_neu")

    # Start from an empty list on every run: appending to a list created in
    # another cell would double the predictions when this cell is re-run and
    # make the column assignment below fail on length.
    selected_text = []

    for index, row in df_test.iterrows():
        text = row.text
        if row.sentiment == "neutral" or len(text.split()) <= 2:
            # neutral and very short tweets: the whole tweet is the answer
            selected_text.append(text)
        elif row.sentiment == "positive":
            selected_text.append(predict_entities(text, model_pos))
        else:
            selected_text.append(predict_entities(text, model_neg))

    # only assign when predictions were actually produced
    df_test["selected_text"] = selected_text