In [1]:
!pip install fuzzywuzzy
!pip install vaderSentiment




In [2]:
import os
import re
import sys
import nltk
import json
import collections
import numpy as np
import pandas as pd
from nltk import ngrams
from fuzzywuzzy import fuzz 
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, sentiwordnet as swn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer



In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Configuration cell

In [5]:
#All input and output files are present in this folder
data_path = '/content/drive/MyDrive/Bridgei2i Inter-IIT NLP Competition/Submit_data/'
#Initializing the brand name list
BN1 = np.array(['sitronics','apple','cherry','nexian','polytron','vestel','micromax','wileyfox','xiaomi','genpro','starmobile','lenovo','blackberry','honor','fujitsu','karbonn','reliance',
        'mobiistar','celkon', 'caterpillar', 'lg', 'philips','hcl', 'google','coolpad','gionee','realme', 'motorola', 'lenovo', 'oneplus', 'videocon'])
BN2 = np.array(['lava', 'tecno', 'ziox', 'poco', 'tcl','spc', 'x-tigi', 'sony', 'condor', 'himax',  'asus', 'dtac', 'hp', 'gresso', 'ais',  'akai', 'e-boda',  'amoi', 'advan','purism, spc', 'zen', 'just5',  'konka', 
        'dbtel',   'iball', 'm dot', 'cubot', 'imo', 'evolio',  'acer', 'wiko','yotaphone', 'ivoomi', 'aselsan', 'spice', 'blu', 'cloudfone', 'myphone', 'megafon', 'firefly',
        '10.or', 'jio', 'torque','highscreen', 'salora', 'tiptel', 'bbk', 'nec', 'haier', 'ulefone', 'ningbo bird', 'wasam', 'hicore', 'axioo','koryolink',  'dopod', 
        'iqoo', 'medion', 'manta', 'kt tech', 'nextbit', 'vinsmart', 'venera', 'nubia', 'lanix','groupe bull', 'comio', 'osmo', 'mito', 'vivo', 'technisat','andromax', 'mitsubishi', 'zte', 
        'sansui', 'meitu', 'ninetology', 'creo', 'explay','gigaset', 'panasonic', 'samsung', 'tecno', 'texet','smartisan', 'verzo', 'vodafone', 'positivo', 'obi worldphone',
        'jrc', 'bullitt', 'allview', 'garmin','intex', 'centric', 'zyrex', 'thuraya','infosonics', 'oppo', 'black shark', 'beeline', 'foxconn','archos', 'asiafone', 'kyocera', 'kyoto',
        'swipe', 'kruger&matz', 'pantech', 'wellcom', 'gtel','vsun', 'microsoft', 'blu', 'walton','casio', 'okwu', 'yu televentures', 'roverpc', 'sico', 'olivetti', 
        'mphone', 'gigabyte', 'mobiwire', 'gfive', 'hisense', 'datawind', 'evercoss', 'nokia', 'meizu','maxtron', 'infinix', 'utok', 'jablotron',
        'brondi', 'bq', 'iball', 'alcatel', 'detel', 'voice', 'mls','htc', 'intex', 'hmd', 'blackberry', 'luna', 'mts', 'kult','zopo', 'onida', 'itel', 'huawei', 'evertektunisie',
        'toshiba', 'jolla', 'i-mobile', 'leeco','umi', 'hitachi', 'smartron', 'benq','infocus', 'lyf', 'gradiente','bittium', 'xolo', 'zonda', 'doro', 'vitell'])


# Brand Identification

In [6]:
def divideDataFrames(file_path,col_name,MT_preds_df):
  #df = pd.read_excel(file_path)
  df = pd.read_csv(file_path)
  df = pd.merge(df, MT_preds_df[['Text_ID','Mobile_Tech_Flag_Predicted']], on='Text_ID') 
  mobile_tech_text = df.loc[df[col_name]==1]
  non_mobile_tech_text = df.loc[df[col_name]==0]
  return mobile_tech_text, non_mobile_tech_text

In [7]:
# supporting functions

def Func_tokens_create(tweet):
  list_of_tokens=tweet.split(" ")
  return list_of_tokens

def ratio_length(token,mobile):
  if min(len(token),len(mobile))/ max(len(token),len(mobile)) >0.7:
    return True
  else:
    return False

def Fetch_data(file_path):
  df=pd.read_csv(file_path)
  return df

def Find_Brands(df,cleaned_file):
  text_brand_names=[]
  matched_words=[]
  text_id = []
  for count,each_text in enumerate(cleaned_file):
    current_text_brand_names = []  
    current_matched_words = []
    tokenize_text = Func_tokens_create(each_text)
    
    for each_token in np.unique(tokenize_text):
      if each_token.isalnum():
        scores_list=[fuzz.ratio(each_token, word)/100 for word in BN1]
        if max(scores_list)>0.75:
          index= scores_list.index(max(scores_list))#here only top brand i have consider # we can add more if score is tie
          if ratio_length(each_token,BN1[index]):
            brand_name=BN1[index]
            current_text_brand_names.append(brand_name)
            current_matched_words.append(each_token)
        if each_token in BN2:
          current_text_brand_names.append(each_token)
          current_matched_words.append(each_token)
    text_brand_names.append(current_text_brand_names)  
    matched_words.append(current_matched_words)
  return text_brand_names, matched_words
  

In [8]:
def Brand_Identification_train():

  #Processing the input data
  MTAdf, NMTAdf = divideDataFrames(data_path+'dev_data_article.xlsx','Mobile_Tech_Flag')
  MTTdf, NMTTdf = divideDataFrames(data_path+'dev_data_tweet.xlsx','Mobile_Tech_Tag')

  #Processing the tweets
  df_tweet = Fetch_data(data_path+'dev_data_tweet_translated.csv')
  df_tweet = df_tweet[['Tweet_ID','Tweet', 'Mobile_Tech_Tag']]
  df_tweet['remove_lower_punct'] = df_tweet['Tweet'].str.lower().str.replace("'", '').str.replace('[^\w\s]', ' ').str.replace(" \d+", " ").str.replace(' +', ' ').str.strip()
  df_tweet['remove_lower_punct'] = df_tweet.apply(lambda row: re.sub(r"http\S+", "", row['remove_lower_punct']), axis=1)
  final_cleaned_tweets = df_tweet['remove_lower_punct'].to_numpy()
  tweet_brand_names, tweet_matched_words = Find_Brands(df_tweet,final_cleaned_tweets)

  #Processing the articles
  df_articles = Fetch_data(data_path+'Mob_tech_articles.csv')
  df_articles = df_articles[['Text_ID','Text', 'Mobile_Tech_Flag']]
  df_articles['remove_lower_punct'] = df_articles['Text'].str.lower().str.replace("'", '').str.replace('[^\w\s]', ' ').str.replace(" \d+", " ").str.replace(' +', ' ').str.strip()
  df_articles['remove_lower_punct'] = df_articles.apply(lambda row: re.sub(r"http\S+", "", row['remove_lower_punct']), axis=1)
  final_cleaned_articles = df_articles['remove_lower_punct'].to_numpy()
  article_brand_names, article_matched_words = Find_Brands(df_articles,final_cleaned_articles)
  
  
  Output_Dataframe = pd.DataFrame()
  Output_Dataframe['Text_ID'] = np.append(df_articles['Text_ID'],df_tweet['Tweet_ID'])
  Output_Dataframe['Mobile_Tech_Flag_Predicted'] =  ""
  Output_Dataframe['Brands_Entity_Identified'] =  np.append(article_brand_names,tweet_brand_names)
  Output_Dataframe['Sentiment_Identified'] =  ""
  Output_Dataframe['Text'] =  np.append(final_cleaned_articles,final_cleaned_tweets)
  
  temp = Output_Dataframe.apply(pd.Series.explode)

  NM_Output_Dataframe =pd.DataFrame()
  NM_Output_Dataframe['Text_ID'] =  np.append(NMTAdf['Text_ID'],NMTTdf['Tweet_ID'])
  NM_Output_Dataframe['Mobile_Tech_Flag_Predicted'] =  ""
  NM_Output_Dataframe['Brands_Entity_Identified'] =  ""
  NM_Output_Dataframe['Sentiment_Identified'] =  ""
  NM_Output_Dataframe['Text'] =  np.append(NMTAdf['Text'],NMTTdf['Tweet'])
  
  result = pd.concat([temp,NM_Output_Dataframe])
  result.to_csv(data_path+'FINAL_OUTPUT_2.csv')
  print("Train Brands Extracted Successfully!")


In [9]:
def Brand_Identification_test():

  #Processing the input data
  MT_Flag_predicted = pd.read_csv(data_path+'valid_data_1.csv')
  MT_texts, NMT_texts = divideDataFrames(data_path+'evaluation_data_translated.csv','Mobile_Tech_Flag_Predicted',MT_Flag_predicted)
  
  #Processing the tweets
  df_text = MT_texts
  df_text = df_text[['Text_ID','Text', 'Mobile_Tech_Flag_Predicted']]
  df_text['remove_lower_punct'] = df_text['Text'].str.lower().str.replace("'", '').str.replace('[^\w\s]', ' ').str.replace(" \d+", " ").str.replace(' +', ' ').str.strip()
  df_text['remove_lower_punct'] = df_text.apply(lambda row: re.sub(r"http\S+", "", row['remove_lower_punct']), axis=1)
  final_cleaned_text = df_text['remove_lower_punct'].to_numpy()
  text_brand_names, text_matched_words = Find_Brands(df_text,final_cleaned_text)
  
  Output_Dataframe = pd.DataFrame()
  Output_Dataframe['Text_ID'] = MT_texts['Text_ID']
  Output_Dataframe['Mobile_Tech_Flag_Predicted'] = MT_texts['Mobile_Tech_Flag_Predicted']
  Output_Dataframe['Brands_Entity_Identified'] =  text_brand_names
  Output_Dataframe['Sentiment_Identified'] =  ""
  Output_Dataframe['Text'] =  final_cleaned_text
  
  temp = Output_Dataframe.apply(pd.Series.explode)

  NM_Output_Dataframe =pd.DataFrame()
  NM_Output_Dataframe['Text_ID'] =  NMT_texts['Text_ID']
  NM_Output_Dataframe['Mobile_Tech_Flag_Predicted'] =  NMT_texts['Mobile_Tech_Flag_Predicted']
  NM_Output_Dataframe['Brands_Entity_Identified'] =  ""
  NM_Output_Dataframe['Sentiment_Identified'] =  ""
  NM_Output_Dataframe['Text'] =  NMT_texts['Text']
  
  result = pd.concat([temp,NM_Output_Dataframe])
  result.to_csv(data_path+'Evaluation_Output_2.csv')
  print("Evaluation Brands Extracted Successfully!")


In [10]:
#Brand_Identification_train()
Brand_Identification_test()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Evaluation Brands Extracted Successfully!


# Single Sentiment Analysis

In [11]:
def find_sentiment(sentence_list):
    analyser = SentimentIntensityAnalyzer()

    sentiment_score_list = []
    sentiment_label_list = []

    for sentence in sentence_list:
        sentiment_score = analyser.polarity_scores(sentence)

        if sentiment_score['compound'] >= 0.05:
            sentiment_score_list.append(sentiment_score['compound'])
            sentiment_label_list.append('Positive')
        elif sentiment_score['compound'] > -0.05 and sentiment_score['compound'] < 0.05:
            sentiment_score_list.append(sentiment_score['compound'])
            sentiment_label_list.append('Neutral')
        elif sentiment_score['compound'] <= -0.05:
            sentiment_score_list.append(sentiment_score['compound'])
            sentiment_label_list.append('Negative')
        
    return sentiment_label_list, sentiment_score_list

# Multi Brand Sentiment Analysis

In [12]:
def most_frequent(List): 
    occurence_count = Counter(List) 
    return occurence_count.most_common(1)[0][0]

def partition_one(brand_list, sentence):
    middle_part_list = []
    brand=brand_list[0]
    all_part = sentence.split(brand)
    p1=all_part[0].split()
    p2=all_part[1].split()
    middle_part = ' '.join(p1[len(p1)//2:])+' '+brand+' '
    middle_part += ' '.join(p2[:len(p2)//2])
    middle_part_list.append(middle_part)
    return middle_part_list

def partition_three(brand_list, sentence):
    prefix_part_list = []
    middle_part_list = []
    suffix_part_list = []
    brand=brand_list[0]
    all_part = sentence.split(brand)
    prefix_part = ''.join(all_part[0])+' '+brand
    suffix_part = brand+' '+''.join(all_part[1:])
    splitted = prefix_part.split(' ')
    p1=all_part[0].split()
    p2=all_part[1].split()
    middle_part = ' '.join(p1[len(p1)//2:])+' '+brand+' '
    middle_part += ' '.join(p2[:len(p2)//2])
    prefix_part_list.append(prefix_part)
    middle_part_list.append(middle_part)
    suffix_part_list.append(suffix_part)
    return prefix_part_list, middle_part_list, suffix_part_list

In [13]:
def sub_main(brand_names_list,tweet_or_text):
  sentiment_list=[]
  if len(tweet_or_text) <= 280:
    m = partition_one(brand_names_list,tweet_or_text)
    #print(m)
    result=find_sentiment(m)
    sentiment_list.append(result[0][0])

  else:
    p, m, s = partition_three(brand_names_list, tweet_or_text)
    sentiments = [find_sentiment(p),
                  find_sentiment(m),
                  find_sentiment(s)]
    sentiments_list= []
    for i in range(len(sentiments)):
      sentiments_list.append(sentiments[i][0])
 
    sentiments_np=np.array(sentiments_list)
    for i in range(0,sentiments_np.shape[1]):
      sentiment_list.append(most_frequent(sentiments_np[:,i]))
  return sentiment_list

In [14]:
def main(eval_data_path):
  Brand_Identification_test()
  print("Brand Identification Process is done!!!")
  df=pd.read_csv(eval_data_path,index_col=0)
  print("Total Records",len(df))
  for row in range(0,len(df)):
    brand_names_list=df['Brands_Entity_Identified'].iloc[row]
    text=df['Text'].iloc[row]
    if row%100==0:
      print('Sentiments for',row,'records have been computed so far!!!')
    if brand_names_list is  np.nan:
      continue
    result=sub_main(brand_names_list,text)
    df['Sentiment_Identified'].iloc[row]=result[0]
  new_df=df[['Text_ID','Mobile_Tech_Flag_Predicted','Brands_Entity_Identified','Sentiment_Identified']]
  new_df.to_csv('/content/drive/MyDrive/Bridgei2i Inter-IIT NLP Competition/Submit_data/Evaluation_Output_2.csv',index=False)


In [15]:
eval_data_path=data_path+'Evaluation_Output_2.csv'
main(eval_data_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Evaluation Brands Extracted Successfully!
Brand Identification Process is done!!!
Total Records 530
Sentiments for 0 records have been computed so far!!!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Sentiments for 100 records have been computed so far!!!
Sentiments for 200 records have been computed so far!!!
Sentiments for 300 records have been computed so far!!!
Sentiments for 400 records have been computed so far!!!
Sentiments for 500 records have been computed so far!!!
