# By [OTP];


# Importaciones basicas

In [1]:
import pandas as pd #manipular data
from imblearn.under_sampling import RandomUnderSampler#bajar data grande para igualarla a la pequeña

## Informacion del DataSet 
La data esta compuesta por dos columnas:

- Reviews de peliculas
- Sentimiento del comentario: Positivo / Negativo

# 1 Preparar Data

1.1 Leer Dataset

In [2]:
df_review = pd.read_csv('IMDB Dataset.csv/IMDB Dataset.csv')
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


 # 1.2 DataSet Desbalanceado 

In [4]:
#creo un dataframe con sentimiento positivo
df_positivo = df_review[df_review['sentiment']=='positive'][:9000]
#creo un dataframe con sentimiento negativo
df_negativo = df_review[df_review['sentiment']=='negative'][:1000]


#creo otro dataframe nuevo que es la concatenacion de los dos anteriores
df_review_des = pd.concat([df_positivo, df_negativo])
df_review_des

#el dataframe nuevo es el dataframe original pero reducido y desbalanceado 

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
...,...,...
2000,Stranded in Space (1972) MST3K version - a ver...,negative
2005,"I happened to catch this supposed ""horror"" fli...",negative
2007,waste of 1h45 this nasty little film is one to...,negative
2010,Warning: This could spoil your movie. Watch it...,negative


In [5]:
df_review_des.value_counts('sentiment')

sentiment
positive    9000
negative    1000
Name: count, dtype: int64

 # 1.3 DataSet Balanceado 

In [6]:
rus = RandomUnderSampler()
df_review_bal, df_review_bal['sentiment'] = rus.fit_resample(df_review_des[['review']],
                                                             df_review_des['sentiment'])
df_review_bal

Unnamed: 0,review,sentiment
0,Basically there's a family where a little boy ...,negative
1,"This show was an amazing, fresh & innovative i...",negative
2,Encouraged by the positive comments about this...,negative
3,Phil the Alien is one of those quirky films wh...,negative
4,I saw this movie when I was about 12 when it c...,negative
...,...,...
1995,Recap: Based on the true story of Charlie Wils...,positive
1996,As soon as the credits rolled on Saturday nigh...,positive
1997,i originally seen the flash Gordon serial on P...,positive
1998,The first Disney animated film without the str...,positive


In [7]:
#corroboramos que la data esta balanceada:

df_review_bal.value_counts(['sentiment'])

sentiment
negative     1000
positive     1000
Name: count, dtype: int64

# 2 Separando data para entrenar y testear


In [8]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_review_bal, test_size=0.3, random_state=42)


In [9]:
train_x, train_y = train['review'], train['sentiment']
test_x, test_y = test['review'], test['sentiment']

# 3 Vectorizamos el texto (Bag of Words)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')# encuentra los mejores parametros para la data
train_x_vector = tfidf.fit_transform(train_x)# aplicamos esos mejores parametros a la data

test_x_vector = tfidf.transform(test_x)


In [11]:
train_x_vector

<1400x20801 sparse matrix of type '<class 'numpy.float64'>'
	with 121882 stored elements in Compressed Sparse Row format>

# 4 Seleccion de Modelo
### (Aprendizaje supervisado)

### 4.1 Maquinas de soporte vectorial (SVM)

In [13]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(train_x_vector, train_y)

### 4.1.1Testeo del modelo

In [15]:
print(svc.predict(tfidf.transform(['A good movie'])))
print(svc.predict(tfidf.transform(['An excellent movie'])))
print(svc.predict(tfidf.transform(['"I did not like this movie at all I gave this movie away"'])))

['positive']
['positive']
['negative']


### 4.2 Arbol de decision

In [16]:
from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier()
dec_tree.fit(train_x_vector, train_y)

### 4.2.1Testeo del modelo

In [17]:
print(dec_tree.predict(tfidf.transform(['A good movie'])))
print(dec_tree.predict(tfidf.transform(['An excellent movie'])))
print(dec_tree.predict(tfidf.transform(['"I did not like this movie at all I gave this movie away"'])))

['positive']
['positive']
['positive']


### 4.3  Naive Bayes

In [21]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_x_vector.toarray(), train_y)

### 4.4 Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(train_x_vector, train_y)

### 4.4.1Testeo del modelo

In [19]:
print(log_reg.predict(tfidf.transform(['A good movie'])))
print(log_reg.predict(tfidf.transform(['An excellent movie'])))
print(log_reg.predict(tfidf.transform(['"I did not like this movie at all I gave this movie away"'])))

['negative']
['positive']
['negative']


# 5 Evaluacion del modelo

### 5.1 Accuracy del modelo

In [30]:
print(svc.score(test_x_vector,test_y))
print(dec_tree.score(test_x_vector,test_y))
print(gnb.score(test_x_vector.toarray(),test_y))
print(log_reg.score(test_x_vector,test_y))

0.815
0.66
0.6016666666666667
0.815


### Dados estos resultados, obtenermos que:

SVM y Logistic Regression, son los modelos con mejor exactitud, con lo cual podemos escoger por ejemplo SVM que es uno de los que puede predecir una mayor cantidad de sentimientos correctos.

### 5.2 F1 Score

F1 Score = 2*(Recall * Precision)/ (Recall + Precision)

In [34]:
from sklearn.metrics import f1_score

f1_score(test_y, svc.predict(test_x_vector), 
         labels=['positive','negative'], 
         average=None)

array([0.82182986, 0.80762565])

### 5.3 Reporte de clasificacion

In [38]:
from sklearn.metrics import classification_report

print(classification_report(test_y, svc.predict(test_x_vector), 
         labels=['positive','negative']))

#clase(objetivo a predecir, lo que predecimos con el modelo que usamos, el orden de reporte) )

              precision    recall  f1-score   support

    positive       0.79      0.86      0.82       298
    negative       0.85      0.77      0.81       302

    accuracy                           0.81       600
   macro avg       0.82      0.82      0.81       600
weighted avg       0.82      0.81      0.81       600



### 5.4 Matriz de confusion

In [39]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(test_y, svc.predict(test_x_vector), 
         labels=['positive','negative']))

[[256  42]
 [ 69 233]]


# 6 Optimizacion del modelo


### 5.1 GridSearchCV

In [42]:
from sklearn.model_selection import GridSearchCV

parametros = {'C':[1,4,8,16,32],'kernel':['linear', 'rbf']}
svc = SVC()
svc_grid = GridSearchCV(svc, parametros, cv=2)

svc_grid.fit(train_x_vector, train_y)

In [43]:
print(svc_grid.best_estimator_)
print(svc_grid.best_params_)

SVC(C=4, kernel='linear')
{'C': 4, 'kernel': 'linear'}


In [44]:
svc_grid.best_score_

0.835

#### Como se puede observar luego de la optimizacion del modelo pasamos de un 0.81 a 0.83 de acurracy