<a href="https://colab.research.google.com/github/Brijeshtanwar/Topic-Modeling/blob/main/sklearn_nlp_fasttext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# NLP libraries
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Fetch the NLTK corpora this notebook reads: the English stop-word list
# (nltk.corpus.stopwords) and WordNet (backing data for WordNetLemmatizer).
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Training data: CSV hosted in the project repository, read straight from its raw URL.
url = "https://raw.githubusercontent.com/Brijeshtanwar/Topic-Modeling/main/voc_data.csv"
df = pd.read_csv(
    url,
    encoding='unicode_escape',
)

In [4]:
# Keep the comment text and its labelled reason, under short column names.
df = df[['CUSTOMER COMMENT', 'Reason']].rename(
    columns={'CUSTOMER COMMENT': 'comment', 'Reason': 'topic'}
)

In [5]:
# Discard every row that has a missing comment or a missing topic.
df = df.dropna(axis=0)

In [6]:
# Normalise the topic labels: lower-case, trim surrounding whitespace, and
# fold the 'not specified' label into 'others'.
df['topic'] = (
    df['topic']
    .str.lower()
    .str.strip()
    .replace({'not specified': 'others'})
)

In [7]:
# Maps the normalised raw topic labels to the short topic names used as classes.
topic_dic = {
    'application (web/app)': 'application',
    'charges(bro./dp/lpc etc.)': 'charges',
    'service': 'service',
    'others': 'others',
    'process': 'process',
    'people': 'people',
    'product': 'product',
    'research': 'research',
}

In [8]:
# Translate known labels through topic_dic; labels without an entry pass through unchanged.
df['topic'] = df['topic'].map(lambda label: topic_dic.get(label, label))

In [9]:
# Preprocessing

In [10]:
# Drop rows whose comment is a bare int or float rather than text, then
# renumber the rows from zero.
df = df[
    df['comment'].map(lambda value: not isinstance(value, (int, float)))
].reset_index(drop=True)

# Lower-case the remaining comments.
df['comment'] = df['comment'].str.lower()

In [11]:
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise a piece of text into space-separated lower-case letter tokens.

    The input is converted with ``str()``, lower-cased and stripped; every run
    of characters other than A-Z/a-z is replaced by a single space, and the
    result is split on whitespace.

    Args:
        text: Value to clean; any object is accepted because it is passed
            through ``str()`` first.
        flg_stemm: When equal to ``True``, each token is stemmed with NLTK's
            ``PorterStemmer``.
        flg_lemm: When equal to ``True``, each token is lemmatised with NLTK's
            ``WordNetLemmatizer``.
        lst_stopwords: Accepted for call compatibility but currently NOT
            applied: the stop-word filter is disabled in this function.

    Returns:
        The processed tokens joined by single spaces.
    """
    letters_only = re.sub(r"[^A-Za-z]+", ' ', str(text).lower().strip())
    tokens = letters_only.split()

    # Optional stemming (strips suffixes such as -ing, -ly, ...).
    if flg_stemm == True:
        stemmer = nltk.stem.porter.PorterStemmer()
        tokens = list(map(stemmer.stem, tokens))

    # Optional lemmatisation (reduces each word to its root form).
    if flg_lemm == True:
        lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
        tokens = list(map(lemmatizer.lemmatize, tokens))

    return " ".join(tokens)

In [12]:
# English stop words, minus the negations, which are kept out of the list.
lst_stopwords = nltk.corpus.stopwords.words("english")
for negation in ('no', 'not'):
    lst_stopwords.remove(negation)

In [13]:
# Model input: the cleaned comment (no stemming, no lemmatisation).
df["x"] = df["comment"].apply(
    preprocess_text,
    flg_stemm=False,
    flg_lemm=False,
    lst_stopwords=lst_stopwords,
)
# Model target: the normalised topic.
df['y'] = df['topic']

In [14]:
# Only the text and label columns are needed from here on.
df = df.loc[:, ['x', 'y']]

In [15]:
# Prefix every value of the label column (column index 1, i.e. 'y') with '__label__'.
df['y'] = '__label__' + df['y']

In [16]:
from sklearn.model_selection import train_test_split

# Stratified hold-out: 1% of the rows go to the test split, with a fixed seed.
df_train, df_test = train_test_split(
    df,
    stratify=df['y'],
    test_size=0.01,
    random_state=324,
)

In [17]:
import csv

In [18]:
pip install fasttext


Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199770 sha256=0ccf41a049832b9b3d67078d8a2062c0675446799c3b92493f1c9a83c13c18cb
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [58]:
import fasttext


def _write_fasttext_file(frame, path):
    """Write `frame` to `path` in fastText's supervised-learning format.

    One line is written per row: the 'y' value (label), a single space, then
    the 'x' value (text). Writing the lines directly avoids routing the data
    through the csv writer with an empty quotechar, which Python 3.11+
    rejects, and avoids the doubled spaces produced by using ' ' as the
    escape character.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        for label, text in zip(frame['y'], frame['x']):
            handle.write(f"{label} {text}\n")


# Save both splits as text files to train/test the classifier.
_write_fasttext_file(df_train, 'train.txt')
_write_fasttext_file(df_test, 'test.txt')

# Training the fastText classifier
model = fasttext.train_supervised('train.txt', lr=0.5, epoch=25, wordNgrams=2, bucket=200000, dim=50)

# Evaluating performance on the entire test file
model.test('test.txt')

# Save / reload the trained model
# model.save_model('model.bin')
# model = fasttext.load_model("model.bin")

(50, 0.86, 0.86)

In [20]:
# from google.colab import drive
# drive.mount('/content/drive')
# model_path = "/content/drive/MyDrive/fasttext/fasttext_model_jun.bin"

# model = fasttext.load_model("fasttext_model_jun.bin")

In [47]:
# Use the raw-content URL for the pinned commit: the github.com/.../blob/...
# address returns the HTML file viewer, which read_csv cannot parse as the CSV.
url = "https://raw.githubusercontent.com/Brijeshtanwar/Topic-Modeling/21e63d6c835e288129a2ed2447c8deb7d5841c66/NPS%20Data%20sept.csv"
df = pd.read_csv(url)

In [49]:
# Load the NPS workbook from the Colab working directory into `df`.
df = pd.read_excel(io='/content/NPS Data sept.xlsx')

In [50]:
# Expose the free-text survey answer under the column name 'x'.
df = df.rename(
    columns={'Primary Comment: Do you have any other suggestion or feedback for us?': 'x'}
)

In [53]:
# If the text column is called 'Comment', expose it under the name 'x'.
df = df.rename(columns={'Comment': 'x'})

In [56]:
# Clean the comments in place with the same preprocessing options as the training text.
df["x"] = df["x"].apply(
    preprocess_text,
    flg_stemm=False,
    flg_lemm=False,
    lst_stopwords=lst_stopwords,
)

In [59]:
# model.predict(text) returns a (labels, probabilities) pair. Take the first
# label directly and drop the '__label__' prefix, instead of slicing the
# string representation of the returned tuple at fixed character offsets.
df['y_pred'] = df['x'].apply(
    lambda text: model.predict(text)[0][0][len('__label__'):]
)

In [60]:
# Manual keyword mapping
word_dict = {
    'product': 'product',
    'charge': 'charges',
    'brokerage': 'charges',
    'brokerages': 'charges',
    'charges': 'charges',
    'staff': 'people',
    'employees': 'people',
    'employee': 'people',
    'dealer': 'people',
    'delar': 'people',
    ' rm ': 'people',
    'relation': 'people',
    'manager': 'people',
    ' rmo ': 'people',
    'managers': 'people'
}

# Override the predicted topic wherever a keyword occurs in the comment.
# Later entries win over earlier ones because each match overwrites 'y_pred'.
for keyword, topic in word_dict.items():
    bare = keyword.strip()
    if bare != keyword:
        # Space-padded keys mean "whole word". The cleaned text has no
        # leading/trailing spaces, so a literal ' rm ' would miss the word at
        # the start or end of a comment; a word-boundary pattern does not.
        pattern = r'\b' + re.escape(bare) + r'\b'
    else:
        # Plain keys keep their substring semantics.
        pattern = re.escape(keyword)
    hits = df['x'].str.contains(pattern, regex=True, na=False)
    df.loc[hits, 'y_pred'] = topic

In [61]:
# keyword mapping for others:
keyword_class_mapping = {
    "SLOW": "application",
    "GLITCH": "application",
    "UI": "application",
    "FEEDS ISSUE": "application",
    "PORTFOLIO ISSUE": "application",
    "ORDER EXECUTION": "application",
    "REPORTS ISSUE": "application",
    "FEATURES-REALTIME CHART MISSING": "application",
    "DELAY MARGIN UPDATION": "application",
    "ROI": "charges",
    "BROK": "charges",
    "DP": "charges",
    "SMALLCASE": "charges",
    "RESEARCH RECOMMENDATION": "service",
    "DELAY IN CONNECTING CS": "service",
    "PAYIN/PAYOUT": "service",
    "ONLINE services": "service",
    "sarvice":"service",
    "DELAY": "service",
    "process": "process",
    "SEGMENT": "process",
    "ACTIVATION": "process",
    "ECS MANDATE": "process",
    "KYC UPDATION": "process",
    "FINANCIAL UPDATION": "process",
    "SLBM": "product",
    "IPO": "product",
    "TSLO": "product",
    "GTC": "product",
    "BASKET":"product",
    "ORDER": "product",
    "person":"people",
    "knowledge":"people",
    "knowledgable":"people",
    "advice":"people"
}

# Re-label comments still classified as 'others' when they mention a keyword.
# Entries are tried in dictionary order and the first match wins, because a
# re-labelled row is no longer 'others'.
for keyword, topic in keyword_class_mapping.items():
    # df['x'] holds lower-case, letters-and-spaces-only text, so the keyword
    # is normalised the same way; upper-case or punctuated keys could never
    # match otherwise.
    needle = re.sub(r"[^A-Za-z]+", ' ', keyword.lower()).strip()
    if not needle:
        continue
    # Anchor at a word start so short keys such as 'ui' or 'dp' do not fire
    # inside unrelated words, while prefixes such as 'brok' still match.
    pattern = r'\b' + re.escape(needle)
    hits = df['x'].str.contains(pattern, regex=True, na=False)
    df.loc[hits & (df['y_pred'] == 'others'), 'y_pred'] = topic

In [62]:
## Sentiment analysis

In [63]:
import torch

In [64]:
!pip install transformers




In [65]:
from transformers import (
    AutoModelForSequenceClassification,
    TFAutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
)
import numpy as np
from scipy.special import softmax

# Hugging Face checkpoint used for sentiment classification.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PyTorch weights. Note: this rebinds the notebook-global name `model`.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [66]:
def sentiment_score(text):
    """Return the index of the highest-scoring sentiment class for `text`.

    Uses the notebook-global `tokenizer` and `model`.

    Args:
        text: The string to classify.

    Returns:
        int: position of the largest logit in the model output.
    """
    # Truncate at the token level: a character-based cut made by the caller
    # does not bound the number of tokens fed to the model.
    tokens = tokenizer.encode(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits))


In [67]:
# Class index -> readable sentiment label.
mappings = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
# Score at most the first 512 characters of each comment, then swap the class
# index for its label.
df['sentiment'] = (
    df['x']
    .apply(lambda comment: sentiment_score(comment[:512]))
    .replace(mappings)
)

In [78]:
# Render `df` as an interactive Colab table: 20 rows per page, index column hidden.
from google.colab import data_table
data_table.DataTable(df, include_index=False, num_rows_per_page=20)

Unnamed: 0,Date,NPS,x,y_pred,sentiment,month,NPS_cat
0,2023-04-01 00:10:05,10,,others,Neutral,April,Promoter
1,2023-04-01 13:56:47,10,,others,Neutral,April,Promoter
2,2023-04-01 14:11:34,9,,others,Neutral,April,Promoter
3,2023-04-01 17:06:20,4,unsatisfactory experience in general,service,Negative,April,Detractor
4,2023-04-01 17:16:26,1,,others,Neutral,April,Detractor
...,...,...,...,...,...,...,...
3338,2023-10-01 11:36:14,0,,others,Neutral,October,Detractor
3339,2023-10-01 15:19:30,10,,others,Neutral,October,Promoter
3340,2023-10-01 18:44:50,2,,others,Neutral,October,Detractor
3341,2023-10-01 20:57:24,10,,others,Neutral,October,Promoter


In [74]:
# Month name (e.g. 'April') derived from the datetime column 'Date'.
df = df.assign(month=df['Date'].dt.month_name())

In [76]:
# NPS bands: score >= 9 -> Promoter, score >= 7 -> Passive, anything else -> Detractor.
# Conditions are checked in order, so the first one that holds decides the band.
df['NPS_cat'] = np.select(
    [df['NPS'] >= 9, df['NPS'] >= 7],
    ['Promoter', 'Passive'],
    default='Detractor',
)

In [79]:
# Write the scored frame to an Excel workbook in the working directory.
# NOTE(review): the default index=True also writes the row index as an extra
# unnamed column — confirm that is wanted for downstream use.
df.to_excel('NPS_data_sept_final.xlsx')

In [36]:
## Summarization

In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m627.8 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Interactive Hugging Face login; prompts for an access token.
# NOTE(review): presumably required to download the Llama-2 checkpoint used below — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: read).
[1m[31mCannot authenticate throu

In [None]:
# pip install accelerate

In [None]:
# ! pip install torch==1.13.1


In [None]:
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer
import transformers
import torch

# Dedicated names: the globals `model` and `tokenizer` are read by
# sentiment_score(), so rebinding them here would silently break sentiment
# scoring if that cell is run again afterwards.
llama_model_id = "meta-llama/Llama-2-7b-chat-hf"

llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_id)

# Text-generation pipeline used for the summaries.
pipeline = transformers.pipeline(
    "text-generation", #task
    model=llama_model_id,
    tokenizer=llama_tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    max_length=1000,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=llama_tokenizer.eos_token_id
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

ImportError: ignored

In [None]:
# Wrap the transformers pipeline so LangChain can use it as an LLM.
# NOTE(review): temperature=0 is supplied via model_kwargs while the pipeline
# was created with do_sample=True — confirm this combination behaves as intended.
llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})

In [None]:
from langchain import PromptTemplate,  LLMChain

# Prompt asking for a five-bullet summary of the text substituted for {text}.
template = """
              Write a concise summary of the following text delimited by triple backquotes.
              Return your response in 5 bullet points which covers the key points of the text.
              ```{text}```
              BULLET POINT SUMMARY:
           """

prompt = PromptTemplate(
    input_variables=["text"],
    template=template,
)

# Chain that fills the prompt and sends it to the wrapped LLM.
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
)

In [None]:
# One summary per (topic, sentiment) group.
for idx, data in df.groupby(['y_pred','sentiment']):
  # Use the comments of THIS group only; joining the whole frame would give
  # every group the same summary. Blank comments are skipped so they do not
  # use up the 1000-character budget.
  comments = [comment for comment in data['x'] if isinstance(comment, str) and comment.strip()]
  if not comments:
    continue
  text = ". ".join(comments)
  text = text[:1000]
  print(idx)
  print(llm_chain.run(text))




In [None]:
# Ask PyTorch to release the unused GPU memory held in its caching allocator.
torch.cuda.empty_cache()

In [None]:
import gc

# Drop the notebook's references to the LLM objects so that the garbage
# collector can reclaim them. pop(..., None) tolerates names that were never
# defined, so the cell can be run in any kernel state.
for _name in ('llm_chain', 'llm', 'pipeline'):
    globals().pop(_name, None)
gc.collect()

In [None]:
# Print the allocator report so its line breaks render, and skip it on
# machines without a CUDA device.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available; no GPU memory summary to report.")