In [1]:
#importing libraries
import pandas as pd #to handle data
import numpy as np # for basic statistical operations
import warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
#loading the dataset
df = pd.read_csv('/home/ANANT/ajadhav/CategorizeUserReviews/out.csv',sep='delimiter')

In [3]:
#renaming the column name
df.columns = ['Text']
#removing the numbers
df.Text = df.Text.str.replace('\d+', '')
#removing the tab signs
df.Text = df.Text.map(lambda x: x.lstrip('\t').rstrip('aAbBcC'))

In [4]:
#creating disctinary to get cosine distance
Categories = ['costly expensive high big','easy to use feasible adaptable','good quality best awesome better','cheap low economical affordable']

#column names to additional columns
CategoryCol = ['Costly','Easy_to_Use','Good_Quality','Cheap']

In [5]:
#snippet of the dataset
df.head()

Unnamed: 0,Text
0,Very oily and creamy. Not at all what I expect...
1,This palette was a decent price and I was look...
2,The texture of this concealer pallet is fantas...
3,I really can't tell what exactly this thing is...
4,"It was a little smaller than I expected, but t..."


In [6]:
#this function is calculation cosine distance from each category to review
import re, math
from collections import Counter

WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
     numerator = sum([vec1[x] * vec2[x] for x in intersection])

     sum1 = sum([vec1[x]**2 for x in vec1.keys()])
     sum2 = sum([vec2[x]**2 for x in vec2.keys()])
     denominator = math.sqrt(sum1) * math.sqrt(sum2)

     if not denominator:
        return 0.0
     else:
        return float(numerator) / denominator

def text_to_vector(text):
     words = WORD.findall(text)
     return Counter(words)


def LoopCalc(Temp):
    vector1 = text_to_vector(Temp)
    Cat0 = text_to_vector(Categories[0])
    Cat1 = text_to_vector(Categories[1])
    Cat2 = text_to_vector(Categories[2])
    Cat3 = text_to_vector(Categories[3])

    cosine0 = get_cosine(vector1, Cat0)
    cosine1 = get_cosine(vector1, Cat1)
    cosine2 = get_cosine(vector1, Cat2)
    cosine3 = get_cosine(vector1, Cat3)
    return cosine0,cosine1,cosine2,cosine3



In [7]:
#converting output to a dataframe
StackValues =  df.Text.apply(lambda x: LoopCalc(x))

In [8]:
#merging the ooutput to text data
CosineProbabilites = pd.DataFrame(np.column_stack(list(zip(*StackValues))), columns=CategoryCol)
ReviewWithCosine = pd.concat([df, CosineProbabilites], axis=1, sort=False)

In [9]:
#Some reviews are neutral means they are not falling in any category
ReviewWithCosine['Neutral'] = ReviewWithCosine.iloc[:,1:].sum(axis=1)<=0
ReviewWithCosine.Neutral = ReviewWithCosine.Neutral*1

In [10]:
#snippet of final dataset
ReviewWithCosine.head()

Unnamed: 0,Text,Costly,Easy_to_Use,Good_Quality,Cheap,Neutral
0,Very oily and creamy. Not at all what I expect...,0.0,0.2,0.0,0.0,0
1,This palette was a decent price and I was look...,0.0,0.0,0.0,0.0,1
2,The texture of this concealer pallet is fantas...,0.0,0.081349,0.027116,0.0,0
3,I really can't tell what exactly this thing is...,0.0,0.0,0.0,0.0,1
4,"It was a little smaller than I expected, but t...",0.0,0.08,0.0,0.044721,0


In [11]:
#converting cosine distances to binary format ie. 1 and zero
b = np.zeros_like(ReviewWithCosine.iloc[:,1:6])
b[np.arange(len(ReviewWithCosine.iloc[:,1:6])), ReviewWithCosine.iloc[:,1:6].values.argmax(1)] = 1
ReviewWithCosine.iloc[:,1:6] = b

In [12]:
#updated snippte of the dataset
ReviewWithCosine.head()

Unnamed: 0,Text,Costly,Easy_to_Use,Good_Quality,Cheap,Neutral
0,Very oily and creamy. Not at all what I expect...,0.0,1.0,0.0,0.0,0.0
1,This palette was a decent price and I was look...,0.0,0.0,0.0,0.0,1.0
2,The texture of this concealer pallet is fantas...,0.0,1.0,0.0,0.0,0.0
3,I really can't tell what exactly this thing is...,0.0,0.0,0.0,0.0,1.0
4,"It was a little smaller than I expected, but t...",0.0,1.0,0.0,0.0,0.0


In [13]:
#saving categoriees and text to csv file
ReviewWithCosine.to_csv('ReviewsWithCategory.csv',index=False)

In [14]:
#converting text data into numeric format
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
transformer  = TfidfVectorizer(lowercase= True,stop_words=stop,max_features=500)
X = transformer.fit_transform(ReviewWithCosine.Text)

In [15]:
#splitting into train and test 75 and 25%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,ReviewWithCosine.iloc[:,1:6],test_size = 0.25,
                                                   random_state = 42)


In [16]:
#building a classification model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [17]:
#fiting our data on the model
model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
#predicting model on test data
y_pred = model.predict(X_test)

In [19]:
#validating the results with confusion matrix
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score
confusion_matrix(y_test.values.argmax(axis=1),y_pred.argmax(axis=1))

array([[  889,   486,    12,     0,     5],
       [ 2908, 29783,   509,    30,  1370],
       [ 1598,  1738,  1687,     3,    17],
       [  205,   222,    21,   180,    37],
       [ 1659,  2720,   161,     9,  3377]])

In [20]:
#validating the results with Accuracy

print(accuracy_score(y_test.values.argmax(axis=1),y_pred.argmax(axis=1)))

0.7237335267803168


In [21]:
#validating the results with F1 Score

print(f1_score(y_test.values.argmax(axis=1),y_pred.argmax(axis=1),average='weighted'))

0.7391932578967094


In [23]:
# Put your new review here to validate
new_review = '01Very oily and creamy. Not at all what I expected... ordered this to try to highlight and contour and it just looked awful!!! Plus, took FOREVER to arrive.Dont waste your money'

In [24]:
#pre processing and converting to numeric format
new_review = ''.join([i for i in new_review if not i.isdigit()])
new_review = re.sub(r'[^\w\s]','',new_review)
new_review=[new_review]
new_test = transformer.transform(new_review)

In [25]:
#printing predictions
print('Predicted category for given review is:- ',CategoryCol[model.predict(new_test).argmax(axis=1).item()])


('Predicted category for given review is:- ', 'Easy_to_Use')
