<a href="https://colab.research.google.com/github/Brijeshtanwar/Topic-Modeling/blob/main/sklearn_nlp_fasttext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# NLP libraries
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
url = """https://raw.githubusercontent.com/Brijeshtanwar/Topic-Modeling/main/voc_data.csv"""
df = pd.read_csv(url,  encoding = 'unicode_escape')

In [4]:
df = df[['CUSTOMER COMMENT', 'Reason']]
df.columns = ['comment', 'topic']

In [5]:
# Drop na
df.dropna(axis=0, inplace=True)

In [6]:
#lower case
df['topic'] = df['topic'].str.lower()
df['topic'] = df['topic'].str.strip()
df.loc[df.topic=='not specified', 'topic'] = 'others'

In [7]:
topic_dic = {'application (web/app)': 'application',  'charges(bro./dp/lpc etc.)': 'charges', 'service':'service'
, 'others':'others', 'process':'process', 'people':'people', 'product':'product', 'research':'research'}

In [8]:
df['topic'] = df['topic'].apply(lambda x: topic_dic[x] if x in topic_dic else x)

In [9]:
# Preprocessing

In [10]:
# Remove integer comments
df = df[df['comment'].apply(lambda x: not isinstance(x, int))]
df = df[df['comment'].apply(lambda x: not isinstance(x, float))]

df.reset_index(drop=True, inplace=True)

#lower case
df['comment'] = df['comment'].str.lower()

In [11]:
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and  characters and then strip)
    text = re.sub(r"[^A-Za-z]+", ' ' , str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()    ## remove Stopwords
    # if lst_stopwords is not None:
    #     lst_text = [word for word in lst_text if word not in lst_stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    text = " ".join(lst_text)
    return text

In [12]:
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords.remove('no')
lst_stopwords.remove('not')

In [13]:
df["x"] = df["comment"].apply(lambda x: preprocess_text(x, flg_stemm=False, flg_lemm=False, lst_stopwords=lst_stopwords))
# df['x'] = df['comment']
df['y'] = df['topic']

In [14]:
df = df[['x', 'y']]

In [15]:
# Prefixing each row of the category column with '__label__'
df.iloc[:, 1] = df.iloc[:, 1].apply(lambda x: '__label__' + x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, 1] = df.iloc[:, 1].apply(lambda x: '__label__' + x)


In [16]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.01, stratify=df['y'], random_state=324)

In [17]:
import csv

In [18]:
pip install fasttext




In [19]:
import fasttext
# Saving the CSV file as a text file to train/test the classifier
df_train[['y', 'x']].to_csv('train.txt',
                                          index = False,
                                          sep = ' ',
                                          header = None,
                                          quoting = csv.QUOTE_NONE,
                                          quotechar = "",
                                          escapechar = " ")

df_test[['y', 'x']].to_csv('test.txt',
                                     index = False,
                                     sep = ' ',
                                     header = None,
                                     quoting = csv.QUOTE_NONE,
                                     quotechar = "",
                                     escapechar = " ")


# Training the fastText classifier
model = fasttext.train_supervised('train.txt',lr=0.5, epoch=25, wordNgrams=2, bucket=200000, dim=50)

# Evaluating performance on the entire test file
model.test('test.txt')

# Predicting on a single input
# model.predict(ds.iloc[2, 0])

# Save the trained model
# model.save_model('model.bin')
# model = fasttext.load_model("model.bin")

(50, 0.86, 0.86)

In [20]:
# from google.colab import drive
# drive.mount('/content/drive')
# model_path = "/content/drive/MyDrive/fasttext/fasttext_model_jun.bin"

# model = fasttext.load_model("fasttext_model_jun.bin")

In [21]:
url = """https://raw.githubusercontent.com/Brijeshtanwar/Topic-Modeling/main/DS-July'23%20NPS%20Data.csv"""
df = pd.read_csv(url,  encoding = 'unicode_escape')

In [22]:
df.rename(columns={'Primary Comment: Do you have any other suggestion or feedback for us?':'x'},inplace=True)

In [23]:
df["x"] = df["x"].apply(lambda x: preprocess_text(x, flg_stemm=False, flg_lemm=False, lst_stopwords=lst_stopwords))

In [24]:
df['y_pred'] = df['x'].apply(model.predict)
df['y_pred']=df['y_pred'].str[0:1]
df['y_pred'] = df['y_pred'].astype(str)
df['y_pred'] = df['y_pred'].str[12:-5]
# df_test_pred['y'] = df_test_pred['y'].str[9:]

In [25]:
# Manual keyword mapping
word_dict = {
    'product': 'product',
    'charge': 'charges',
    'brokerage': 'charges',
    'brokerages': 'charges',
    'charges': 'charges',
    'staff': 'people',
    'employees': 'people',
    'employee': 'people',
    'dealer': 'people',
    'delar': 'people',
    ' rm ': 'people',
    'relation': 'people',
    'manager': 'people',
    ' rmo ': 'people',
    'managers': 'people'
}

for i in word_dict:
  df.loc[(df['x'].str.contains(i), 'y_pred')] = word_dict[i]

In [26]:
# keyword mapping for others:
keyword_class_mapping = {
    "SLOW": "application",
    "GLITCH": "application",
    "UI": "application",
    "FEEDS ISSUE": "application",
    "PORTFOLIO ISSUE": "application",
    "ORDER EXECUTION": "application",
    "REPORTS ISSUE": "application",
    "FEATURES-REALTIME CHART MISSING": "application",
    "DELAY MARGIN UPDATION": "application",
    "ROI": "charges",
    "BROK": "charges",
    "DP": "charges",
    "SMALLCASE": "charges",
    "RESEARCH RECOMMENDATION": "service",
    "DELAY IN CONNECTING CS": "service",
    "PAYIN/PAYOUT": "service",
    "ONLINE serviceS": "service",
    "DELAY IN CONNECTING DEALER": "service",
    "SETTLEMENT process": "process",
    "SEGMENT ACTIVATION": "process",
    "product ACTIVATION": "process",
    "ECS MANDATE": "process",
    "KYC UPDATION": "process",
    "FINANCIAL UPDATION": "process",
    "SLBM": "product",
    "IPO": "product",
    "TSLO": "product",
    "GTC": "product",
    "BASKET ORDER": "product",
    "LACK OF product KNOWLEDGE": "people",
    "LACK OF process KNOWLEDGE": "people",
    "LACK KNOWLEDGE OF FNO productS": "people"
}
for i in word_dict:
  df.loc[((df['x'].str.contains(i) & df['y_pred']=='others'), 'y_pred')] = word_dict[i]

In [27]:
## Sentiment analysis

In [28]:
import torch

In [29]:
# !pip install transformers




In [30]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
def sentiment_score(text):
  tokens = tokenizer.encode(text, return_tensors='pt')
  result = model(tokens)
  return int(torch.argmax(result.logits))


In [32]:
df['sentiment'] = df['x'].apply(lambda x: sentiment_score(x[:512]))
mappings = {0:'Negative', 1:'Neutral', 2:'Positive'}
df['sentiment'] = df['sentiment'].replace(mappings)

In [33]:
from google.colab import data_table
data_table.DataTable(df[['x','sentiment']], include_index=False, num_rows_per_page=30)

Unnamed: 0,x,sentiment
0,,Neutral
1,ok,Neutral
2,,Neutral
3,zero client friendly,Negative
4,,Neutral
...,...,...
103,dont change charges without intimation or reason,Neutral
104,,Neutral
105,other suggestions is below the money after sel...,Negative
106,,Neutral


In [34]:
## Summarization

In [35]:
!pip install -q transformers einops accelerate langchain bitsandbytes

In [36]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: read).
[1m[31mCannot authenticate throu

In [37]:
# pip install accelerate



In [38]:
# ! pip install torch==1.13.1


In [39]:
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer
import transformers
import torch

model = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model)

pipeline = transformers.pipeline(
    "text-generation", #task
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    max_length=1000,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [40]:
llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0.2})

In [41]:
from langchain import PromptTemplate,  LLMChain

template = """
              Write a concise summary of the following text delimited by triple backquotes.
              Return your response in bullet points which covers the key points of the text.
              ```{text}```
              BULLET POINT SUMMARY:
           """

prompt = PromptTemplate(template=template, input_variables=["text"])

llm_chain = LLMChain(prompt=prompt, llm=llm)

In [45]:
df.groupby(['sentiment', 'y_pred']).

Unnamed: 0,Responded on,Flag,Primary Question: Primary Question-Based on your recent interaction,x,Customer id,User name,Reporting name,y_pred,sentiment
0,7/1/2023 7:18,Detractor,1,,RT9H0,Manas Kumar Ghosh,Shilpa Bachhawat,others,Neutral
1,7/1/2023 16:59,Detractor,2,ok,7ZT16,Sangeeta .,Prashant Maheshwari,others,Neutral
2,7/1/2023 17:04,Passive,7,,YGAZ5,Rahul Singh,Shilpa Bachhawat,others,Neutral
3,7/1/2023 17:07,Detractor,0,zero client friendly,ZTQR9,Rahul Sandeep Sawant,Kalpesh Suresh Gawli,others,Negative
4,7/1/2023 17:08,Promoter,10,,E3W39,Rakesh Sharma,Prashant Maheshwari,others,Neutral
...,...,...,...,...,...,...,...,...,...
103,7/13/2023 19:10,Detractor,0,dont change charges without intimation or reason,M00FA,Shashikant Avsan Gautam,Prachi Mundra,charges,Neutral
104,7/18/2023 16:41,Passive,8,,Z4LCT,Prachi Mundra,Ved Prakash Singh,others,Neutral
105,7/18/2023 20:45,Promoter,9,other suggestions is below the money after sel...,I029H,Himani Bhatia,Prashant Maheshwari,charges,Negative
106,7/27/2023 16:46,Detractor,5,,M0P0N,Prachi Mundra,Ved Prakash Singh,others,Neutral


In [43]:
text = ". ".join(df['x'])
text = text[:1000]

In [44]:
print(llm_chain.run(text))

 • "Ok. Thank you, Kotak team, for providing good family"
            • "Very good knowledgeable person"
            • "I liked Kotak Neo very much"
            • "Chat is not opening in my phone properly"
            • "Need a lot of knowledge improvement"
            • "Brokerage charges are too high"
            • "Response time should be shorter"
            • "Excellent job"
            • "Please provide a smooth and fast platform"
            • "I want some Kannada language knowing people to guide me"
            • "Customer service is not helpful"
            • "Executives are reluctant to admit technical glitches"
            • "I have applied for an IPO but there is no communication from the exchange"
            • "It took hours to respond to my query"
            • "Unable to place buy orders in my account"
            • "Account is blocked for buying due to lack of pan and adhar linking"
            • "I received a response after market hours"
            • "Kotak Neo is a 