In [1]:
import pandas as pd
# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
# NOTE(review): this hides the warning for every later cell that assigns into a filtered frame.
pd.options.mode.chained_assignment = None
import numpy as np

In [2]:
# Load the ticket export from the current working directory.
df=pd.read_excel('Automatic_Ticket_Assignment.xlsx')
# Rename the columns positionally; the sheet must have exactly four columns in this order.
df.columns=['ShortDescription','Description', 'Caller', 'AssignmentGroup']

In [3]:
# Drop every ticket whose assignment group has fewer than MIN_TICKETS_PER_GROUP tickets.
MIN_TICKETS_PER_GROUP = 200
group_counts = df['AssignmentGroup'].value_counts()
to_remove = group_counts.loc[group_counts < MIN_TICKETS_PER_GROUP].index
df = df.loc[~df['AssignmentGroup'].isin(to_remove)]

In [4]:
# Back-fill each text column from the other one where it is missing.
# Series.fillna with a Series argument fills by index, which gives the same
# result as the former row-wise apply without a Python-level call per row.
# Order matters: the second line reads the already-filled 'Description'.
# Rows where both columns are missing stay missing.
df['Description'] = df['Description'].fillna(df['ShortDescription'])
df['ShortDescription'] = df['ShortDescription'].fillna(df['Description'])

In [5]:
import contractions

def apply_contractions(text):
    """Expand contractions in ``text`` one whitespace-separated token at a time.

    Every token from ``text.split()`` is passed through ``contractions.fix``
    and the results are joined with single spaces, so runs of whitespace in
    the input collapse to one space.
    """
    return ' '.join(contractions.fix(token) for token in text.split())
        
# Expand contractions in both ticket text columns
for column in ('Description', 'ShortDescription'):
    df[column] = df[column].apply(apply_contractions)

In [6]:
import nltk
from nltk.corpus import stopwords
import re

# English stop words from NLTK, materialised as a set for the membership test in text_cleaner.
stop_words = set(stopwords.words('english'))

def text_cleaner(text):
    """Normalise a ticket text for modelling.

    Lowercases ``text``, removes links, replaces every non-letter with a
    space, then drops stop words (module-level ``stop_words``) and tokens of
    length one. Returns the remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    # Strip links of the form [http|https]://...
    without_links = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', lowered)
    # Anything that is not a letter becomes a token separator.
    letters_only = re.sub("[^a-zA-Z]", " ", without_links)
    # Tokens produced by split() carry no whitespace, so the join needs no strip.
    kept = [
        token
        for token in letters_only.split()
        if token not in stop_words and len(token) > 1
    ]
    return " ".join(kept)

# Clean the long description into a new column; 'Description' itself is left untouched.
cleaned_text = [text_cleaner(description) for description in df['Description']]
df['Cleaned_Description'] = cleaned_text

# Clean the short description and overwrite the column with the result.
cleaned_text = [text_cleaner(short_description) for short_description in df['ShortDescription']]
df['ShortDescription'] = cleaned_text

In [7]:
# Concatenate the cleaned description and the cleaned short description, separated by one space.
df['Consolidated_Text']=df['Cleaned_Description'] + ' ' + df['ShortDescription']

In [8]:
from nltk.tokenize import word_tokenize

# Helper that splits a text into word tokens using NLTK's word_tokenize

def tokenization_func(text):
    """Return the word tokens that ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return tokens

# One token list per row of Consolidated_Text, in row order.
list_for_sentence_word_tokens = [
    tokenization_func(sentence) for sentence in df['Consolidated_Text']
]

In [9]:
# Part-of-speech tagging
# pos_tag_sents tags all tokenised sentences in a single call and returns one
# list of (token, tag) pairs per sentence - the same structure the former
# per-sentence nltk.pos_tag loop built, without a separate tagger call per row.
list_of_sen_with_part_of_speech_tagging = nltk.pos_tag_sents(list_for_sentence_word_tokens)

In [10]:
# Lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# WordNetLemmatizer expects WordNet POS constants, so translate the first
# character of each tag produced by nltk.pos_tag into its WordNet counterpart.
wordnet_tags = {
    'N': wordnet.NOUN,
    'V': wordnet.VERB,
    'R': wordnet.ADV,
    'J': wordnet.ADJ,
}

lemmatizer = WordNetLemmatizer()

# Lemmatize every tagged sentence and join it back into a single string.
# Tokens whose tag starts with a letter in wordnet_tags are lemmatized with the
# matching WordNet POS; all other tokens are kept unchanged.
list_of_lemmatized_sen = [
    " ".join(
        lemmatizer.lemmatize(token, wordnet_tags[tag[0]]) if tag[0] in wordnet_tags else token
        for token, tag in tagged_sentence
    )
    for tagged_sentence in list_of_sen_with_part_of_speech_tagging
]

# Overwrite Consolidated_Text with its lemmatized form (one string per row, same order).
df['Consolidated_Text'] = list_of_lemmatized_sen

In [11]:
# Two-column modelling frame: the consolidated text and its assignment group.
new_df = df.loc[:, ['Consolidated_Text', 'AssignmentGroup']]
new_df.columns = ['text', 'label']

In [12]:
# Keep only the first row for each distinct text; later duplicates are dropped
# together with their labels.
new_df = new_df.drop_duplicates(subset='text', keep='first')

In [13]:
from sklearn import preprocessing

def labelencoder(dataframe, return_encoder=False):
  """Encode class labels as integers with scikit-learn's ``LabelEncoder``.

  Args:
    dataframe: Array-like of labels. Despite the name, the caller in this
      notebook passes a single column, not a whole DataFrame.
    return_encoder: When True, also return the fitted encoder so the integer
      codes can later be mapped back to the original labels. Defaults to
      False, which keeps the original return value.

  Returns:
    The encoded labels, or ``(encoded_labels, fitted_encoder)`` when
    ``return_encoder`` is True.
  """
  label_encoder = preprocessing.LabelEncoder()
  encoded = label_encoder.fit_transform(dataframe)

  if return_encoder:
    return encoded, label_encoder
  return encoded

# Replace the assignment-group names in 'label' with the integer codes returned by labelencoder.
new_df['label'] = labelencoder(new_df['label'])

In [18]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One-time setup: uncomment to download the VADER lexicon if it is not installed yet.
# nltk.download('vader_lexicon')

# Sentiment scorer whose polarity_scores() is called on each text in the loop below.
analyzer = SentimentIntensityAnalyzer()

# Prefix every text in new_df with its dominant VADER sentiment label.
# polarity_scores() returns 'neg', 'neu' and 'pos' plus a separate 'compound'
# score. Only the first three are sentiment categories, so the argmax is
# restricted to them. Previously 'compound' took part in max() and, when it
# was the largest value, fell through the else branch to 'positive'.
sentiment_labels = {'neg': 'negative', 'neu': 'neutral', 'pos': 'positive'}

new_text = []
for text in new_df['text']:
    sentiment_score = analyzer.polarity_scores(text)

    # Ties resolve to the first key in sentiment_labels order (neg, neu, pos),
    # matching the key order the original max() iterated over.
    max_sentiment = max(sentiment_labels, key=sentiment_score.get)

    # Prepend the label as an extra leading token.
    new_text.append(f"{sentiment_labels[max_sentiment]} {text}")

[nltk_data] Downloading package vader_lexicon to C:\Users\Devesh
[nltk_data]     Udhin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [19]:
# Overwrite 'text' with the sentiment-prefixed strings; new_text must have been built
# from the current new_df so that its length and order match the rows.
new_df['text'] = new_text