<a href="https://colab.research.google.com/github/Brijeshtanwar/Topic-Modeling/blob/main/sklearn_nlp_fasttext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns', None)
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# NLP libraries
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Fetch the NLTK resources this notebook refers to: the English stop-word list
# and the WordNet corpus. Both calls are top-level expressions, so their return
# values are echoed under the "all" interactivity setting.
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Labelled customer comments, read straight from the project's GitHub repository.
url = "https://raw.githubusercontent.com/Brijeshtanwar/Topic-Modeling/main/voc_data.csv"
df = pd.read_csv(
    url,
    encoding="unicode_escape",
)

In [4]:
# Keep only the comment text and its assigned reason, under short column names.
df = df[['CUSTOMER COMMENT', 'Reason']].rename(
    columns={'CUSTOMER COMMENT': 'comment', 'Reason': 'topic'}
)

In [6]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0)

In [8]:
# Lower-case the topic and trim surrounding whitespace in one pass.
df['topic'] = df['topic'].str.lower().str.strip()

# Rows whose topic is 'not specified' are folded into the 'others' class.
not_specified = df['topic'] == 'not specified'
df.loc[not_specified, 'topic'] = 'others'

In [10]:
# Maps the lower-cased raw topic names to the short class labels used from here on.
topic_dic = {
    'application (web/app)': 'application',
    'charges(bro./dp/lpc etc.)': 'charges',
    'service': 'service',
    'others': 'others',
    'process': 'process',
    'people': 'people',
    'product': 'product',
    'research': 'research',
}

In [11]:
# Replace known raw topics with their short label; unknown topics pass through unchanged.
df['topic'] = df['topic'].apply(lambda raw_topic: topic_dic.get(raw_topic, raw_topic))

In [13]:
# Preprocessing

In [14]:
# Keep only rows whose comment is not a bare number (int or float), then renumber the rows.
df = df[df['comment'].apply(lambda value: not isinstance(value, (int, float)))]
df = df.reset_index(drop=True)

# Normalise the remaining comments to lower case.
df['comment'] = df['comment'].str.lower()

In [15]:
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise a free-text comment into lower-case words joined by single spaces.

    The input is converted to ``str``, lower-cased and stripped; every run of
    characters other than A-Z/a-z is replaced by one space, and the result is
    split on whitespace. The words are then optionally stemmed and/or
    lemmatised and joined back together.

    Parameters
    ----------
    text : object
        The comment. ``None`` and float NaN are treated as "no comment" and
        give an empty string; anything else is passed through ``str``.
    flg_stemm : bool, default False
        Apply NLTK's Porter stemmer to each word.
    flg_lemm : bool, default True
        Apply NLTK's WordNet lemmatiser to each word (after stemming, if both
        flags are set).
    lst_stopwords : list of str, optional
        Accepted for interface compatibility but NOT applied: stop-word
        removal is disabled in this function, so the words are kept whatever
        is passed here.

    Returns
    -------
    str
        The cleaned text; empty when the input has no letters or is missing.
    """
    # A missing comment would otherwise be stringified into the word "none" or
    # "nan" and handed to the classifier as if the customer had typed it.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    ## clean (lower-case, replace everything that is not a letter with a space, strip)
    text = re.sub(r"[^A-Za-z]+", ' ', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [16]:
# English stop words, minus the negations, which carry meaning in customer feedback.
lst_stopwords = nltk.corpus.stopwords.words("english")
for negation in ('no', 'not'):
    lst_stopwords.remove(negation)

In [17]:
# x: cleaned comment text (no stemming, no lemmatisation); y: the topic label.
df["x"] = df["comment"].apply(
    lambda comment: preprocess_text(
        comment,
        flg_stemm=False,
        flg_lemm=False,
        lst_stopwords=lst_stopwords,
    )
)
df['y'] = df['topic']

Unnamed: 0,comment,topic,x,y
0,querries where kotak is at disadvantageous pos...,service,querries where kotak is at disadvantageous pos...,service
1,1) same-day money deduction even if we have ma...,application,same day money deduction even if we have margi...,application
2,ndejjejdjx,others,ndejjejdjx,others
3,worst website trading,application,worst website trading,application
4,"no technical support at real time, application...",service,no technical support at real time application ...,service
5,need knowledge and experience person,service,need knowledge and experience person,service
6,please revisit my brokerage and deemat charges...,charges,please revisit my brokerage and deemat charges...,charges
7,iâm waiting for resolution or a meeting with...,service,i m waiting for resolution or a meeting with m...,service
8,i dint interacted so my rating may be a bias b...,service,i dint interacted so my rating may be a bias b...,service
9,no issues with the rm,service,no issues with the rm,service


In [18]:
# From here on only the model input (x) and the target (y) are needed.
df = df.loc[:, ['x', 'y']]

In [20]:
# Prefix every category in the 'y' column with '__label__'.
# The cell is safe to re-run: values that already carry the prefix are left
# untouched instead of being prefixed a second time.
df = df.assign(
    y=df['y'].map(
        lambda label: label if label.startswith('__label__') else '__label__' + label
    )
)

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the rows, keeping the class proportions of 'y' in both parts.
# The fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.01,
    random_state=324,
    stratify=df['y'],
)

In [32]:
import csv

In [33]:
pip install fasttext




In [34]:
import fasttext


def _write_fasttext_file(frame, path):
    """Write ``frame`` to ``path`` as one training example per line.

    Each line is the value of column ``y`` (the label), a single space, and
    the value of column ``x`` (the text).
    """
    with open(path, 'w', encoding='utf-8') as handle:
        for label, text in zip(frame['y'], frame['x']):
            handle.write(f"{label} {text}\n")


# Saving the splits as text files to train/test the classifier.
# The lines are written directly rather than through
# DataFrame.to_csv(sep=' ', quoting=csv.QUOTE_NONE, quotechar="", escapechar=" "):
# an empty quotechar is rejected by the csv module on Python 3.11+, and a space
# escapechar doubles every space inside the text.
_write_fasttext_file(df_train, 'train.txt')
_write_fasttext_file(df_test, 'test.txt')

# Training the fastText classifier
model = fasttext.train_supervised('train.txt', lr=0.5, epoch=25, wordNgrams=2, bucket=200000, dim=50)

# Evaluating performance on the entire test file
model.test('test.txt')

# Predicting on a single input
# model.predict(ds.iloc[2, 0])

# Save the trained model
# model.save_model('model.bin')
# model = fasttext.load_model("model.bin")

(1528, 0.8049738219895288, 0.8049738219895288)

In [38]:
from google.colab import drive

# Mount Google Drive so files stored there are reachable from the Colab runtime.
drive.mount('/content/drive')
model_path = "/content/drive/MyDrive/fasttext/fasttext_model_jun.bin"

# Load the model from the Drive location defined above. Passing model_path
# (rather than the bare file name) means the file is looked up in the mounted
# Drive folder and not in the current working directory.
# Note: this rebinds `model`, replacing the classifier trained earlier in the session.
model = fasttext.load_model(model_path)

Mounted at /content/drive


In [43]:
# July '23 NPS survey export, read from the project's GitHub repository.
# Note: this rebinds df_test, which until now held the held-out split.
url = "https://raw.githubusercontent.com/Brijeshtanwar/Topic-Modeling/main/DS-July'23%20NPS%20Data.csv"
df_test = pd.read_csv(
    url,
    encoding="unicode_escape",
)

In [47]:
# Give the free-text feedback column the same short name ('x') used for the training text.
df_test = df_test.rename(
    columns={'Primary Comment: Do you have any other suggestion or feedback for us?': 'x'}
)

In [51]:
# Clean the survey comments with the same settings that were used for the training text.
df_test["x"] = df_test["x"].apply(
    lambda comment: preprocess_text(
        comment,
        flg_stemm=False,
        flg_lemm=False,
        lst_stopwords=lst_stopwords,
    )
)

In [53]:
def _top_label(text):
    """Return the model's top predicted label for ``text`` without the '__label__' prefix.

    An empty string is returned if the model gives back no label.
    """
    labels = model.predict(text)[0]
    # The label is read directly from the prediction instead of slicing the
    # string form of the returned tuple at fixed character offsets.
    return labels[0][len('__label__'):] if labels else ''


# df_test_pred is another name for df_test (not a copy): the y_pred column is
# added to the same DataFrame.
df_test_pred = df_test
df_test_pred['y_pred'] = df_test_pred['x'].apply(_top_label)
# df_test_pred['y'] = df_test_pred['y'].str[9:]

In [55]:
# Manual keyword mapping: a comment containing one of these keywords gets the
# mapped class, overriding the model's prediction. Keys are applied in the
# order written, so a later key wins when several match the same comment.
# Keys wrapped in spaces (' rm ', ' rmo ') are meant to match whole words only.
word_dict = {
    'product': 'product',
    'charge': 'charges',
    'brokerage': 'charges',
    'brokerages': 'charges',
    'charges': 'charges',
    'staff': 'people',
    'employees': 'people',
    'employee': 'people',
    'dealer': 'people',
    'delar': 'people',
    ' rm ': 'people',
    'relation': 'people',
    'manager': 'people',
    ' rmo ': 'people',
    'managers': 'people'
}

# Pad each comment with a space on both sides so that the space-wrapped keys
# also match a word at the very start or end of a comment
# (e.g. "no issues with the rm").
padded_text = ' ' + df_test_pred['x'] + ' '

for keyword, label in word_dict.items():
    # regex=False: the keys are plain text, not patterns; na=False: a missing
    # comment simply does not match.
    has_keyword = padded_text.str.contains(keyword, regex=False, na=False)
    df_test_pred.loc[has_keyword, 'y_pred'] = label

In [57]:
# keyword mapping for others:
# phrase -> raw class name. The phrases are upper-case and may contain
# punctuation; the class names are in their raw (un-shortened) form.
keyword_class_mapping = {
    "SLOW": "APPLICATION (WEB/APP)",
    "GLITCH": "APPLICATION (WEB/APP)",
    "UI": "APPLICATION (WEB/APP)",
    "FEEDS ISSUE": "APPLICATION (WEB/APP)",
    "PORTFOLIO ISSUE": "APPLICATION (WEB/APP)",
    "ORDER EXECUTION": "APPLICATION (WEB/APP)",
    "REPORTS ISSUE": "APPLICATION (WEB/APP)",
    "FEATURES-REALTIME CHART MISSING": "APPLICATION (WEB/APP)",
    "DELAY MARGIN UPDATION": "APPLICATION (WEB/APP)",
    "ROI": "CHARGES(BRO./DP/LPC etc.)",
    "BROK": "CHARGES(BRO./DP/LPC etc.)",
    "DP": "CHARGES(BRO./DP/LPC etc.)",
    "SMALLCASE": "CHARGES(BRO./DP/LPC etc.)",
    "RESEARCH RECOMMENDATION": "SERVICE",
    "DELAY IN CONNECTING CS": "SERVICE",
    "PAYIN/PAYOUT": "SERVICE",
    "ONLINE SERVICES": "SERVICE",
    "DELAY IN CONNECTING DEALER": "SERVICE",
    "SETTLEMENT PROCESS": "PROCESS",
    "SEGMENT ACTIVATION": "PROCESS",
    "PRODUCT ACTIVATION": "PROCESS",
    "ECS MANDATE": "PROCESS",
    "KYC UPDATION": "PROCESS",
    "FINANCIAL UPDATION": "PROCESS",
    "SLBM": "PRODUCT",
    "IPO": "PRODUCT",
    "TSLO": "PRODUCT",
    "GTC": "PRODUCT",
    "BASKET ORDER": "PRODUCT",
    "LACK OF PRODUCT KNOWLEDGE": "PEOPLE",
    "LACK OF PROCESS KNOWLEDGE": "PEOPLE",
    "LACK KNOWLEDGE OF FNO PRODUCTS": "PEOPLE"
}

# Apply keyword_class_mapping itself. Looping over word_dict again here would
# only repeat the assignments already made by the manual keyword mapping cell.
#
# NOTE(review): going by the heading of this cell, the phrases are treated as a
# fallback for comments still predicted as 'others'; the first matching phrase
# wins. Confirm that this is the intended scope.
padded_text = ' ' + df_test_pred['x'] + ' '

for keyword, raw_class in keyword_class_mapping.items():
    # Clean the phrase exactly like the comments were cleaned (lower case,
    # letters only), and wrap it in spaces so it matches whole words only —
    # short codes such as "UI" or "DP" must not match inside longer words.
    needle = ' ' + preprocess_text(keyword, flg_stemm=False, flg_lemm=False) + ' '

    # Bring the raw class name to the short label form stored in y_pred,
    # using the same lower/strip + topic_dic lookup as the training topics.
    class_key = raw_class.lower().strip()
    label = topic_dic.get(class_key, class_key)

    is_match = (
        (df_test_pred['y_pred'] == 'others')
        & padded_text.str.contains(needle, regex=False, na=False)
    )
    df_test_pred.loc[is_match, 'y_pred'] = label

Unnamed: 0,Responded on,Flag,Primary Question: Primary Question-Based on your recent interaction,x,Customer id,User name,Reporting name,y_pred
0,7/1/2023 7:18,Detractor,1,,RT9H0,Manas Kumar Ghosh,Shilpa Bachhawat,others
1,7/1/2023 16:59,Detractor,2,ok,7ZT16,Sangeeta .,Prashant Maheshwari,others
2,7/1/2023 17:04,Passive,7,,YGAZ5,Rahul Singh,Shilpa Bachhawat,others
3,7/1/2023 17:07,Detractor,0,zero client friendly,ZTQR9,Rahul Sandeep Sawant,Kalpesh Suresh Gawli,others
4,7/1/2023 17:08,Promoter,10,,E3W39,Rakesh Sharma,Prashant Maheshwari,others
...,...,...,...,...,...,...,...,...
103,7/13/2023 19:10,Detractor,0,dont change charges without intimation or reason,M00FA,Shashikant Avsan Gautam,Prachi Mundra,charges
104,7/18/2023 16:41,Passive,8,,Z4LCT,Prachi Mundra,Ved Prakash Singh,others
105,7/18/2023 20:45,Promoter,9,other suggestions is below the money after sel...,I029H,Himani Bhatia,Prashant Maheshwari,charges
106,7/27/2023 16:46,Detractor,5,,M0P0N,Prachi Mundra,Ved Prakash Singh,others
