### Nombre: Diego Gerardo Casados González

In [1]:
# Adaptar el txt en forma de dataframe
datos = {
    'review': [],
    'sentimiento': []
}
with open('amazon_cells_labelled.txt') as f:
    for line in f:
        temporal = line.split()
        sentimiento = temporal[-1]
        resenia =temporal[:-1]
        resenia = ' '.join(resenia)
        datos['review'].append(resenia)
        datos['sentimiento'].append(sentimiento)

import pandas as pd
df = pd.DataFrame(datos)
df['sentimiento'] = df['sentimiento'].replace({'1':'positivo','0':'negativo'})
df.head()

Unnamed: 0,review,sentimiento
0,So there is no way for me to plug it in here i...,negativo
1,"Good case, Excellent value.",positivo
2,Great for the jawbone.,positivo
3,Tied to charger for conversations lasting more...,negativo
4,The mic is great.,positivo


In [2]:
# Creamos nuestra función de preprocesamiento para limpiar el texto. 
import re
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk import word_tokenize
stopwords = stopwords.words('english')
stopwords.extend(['s', 've','m', 'u'])
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def prepocesamiento(texto):
    texto = texto.lower()
    texto = re.sub(r'([^A-Za-z \t])|(\d)', '', texto)
    texto = word_tokenize(texto)
    texto = [lemmatizer.lemmatize(palabra) for palabra in texto]
    texto = [palabra for palabra in texto if palabra not in stopwords]
    texto = ' '.join(texto)
    return texto
df['review_cleen'] = df['review'].apply(prepocesamiento)
df.head(10)

Unnamed: 0,review,sentimiento,review_cleen
0,So there is no way for me to plug it in here i...,negativo,way plug unless go converter
1,"Good case, Excellent value.",positivo,good case excellent value
2,Great for the jawbone.,positivo,great jawbone
3,Tied to charger for conversations lasting more...,negativo,tied charger conversation lasting minutesmajor...
4,The mic is great.,positivo,mic great
5,I have to jiggle the plug to get it to line up...,negativo,jiggle plug get line right get decent volume
6,If you have several dozen or several hundred c...,negativo,several dozen several hundred contact imagine ...
7,If you are Razr owner...you must have this!,positivo,razr owneryou must
8,"Needless to say, I wasted my money.",negativo,needle say wasted money
9,What a waste of money and time!.,negativo,waste money time


In [3]:
# A continuación se utilizará GridSearch y Cross validation con k=5 optimizado para accuracy:
# - Creamos modelo de Random Forest evaluado con 10,50 y 100 estimadores

from sklearn.model_selection import train_test_split
X = df['review_cleen']
y = df['sentimiento']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

pipeline= Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model',RandomForestClassifier())
])
params = {
    'tfidf__ngram_range': [(1,1),(1,2),(2,2)],
    'model__n_estimators': [10,50,100],
    'model__criterion': ['gini']
}
grid= GridSearchCV(estimator=pipeline, 
                   param_grid=params, 
                   cv=5, 
                   n_jobs=-1, 
                   verbose=3, 
                   scoring='accuracy')
grid.fit(X_train,y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [5]:
#  - Creamos el modelo SVM evaluado con C = [0.1,0.5,1] y kernel=['linear', 'rbf']

pipeline_2 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model',SVC())
])
params_2 = {
    'model__C': [0.1,0.5,1],
    'model__kernel': ['linear','rbf']
    
}
grid_2= GridSearchCV(estimator=pipeline_2, 
                   param_grid=params_2, 
                   cv=5, 
                   n_jobs=-1, 
                   verbose=3, 
                   scoring='accuracy')
grid_2.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [6]:
# Buscamos el mejor best_score_ y best_params_ para cada modelo
grid.best_score_

0.7533333333333333

In [11]:
grid.best_params_

{'model__criterion': 'gini',
 'model__n_estimators': 100,
 'tfidf__ngram_range': (1, 1)}

In [7]:
grid_2.best_score_

0.7816666666666666

In [8]:
grid_2.best_params_

{'model__C': 0.5, 'model__kernel': 'linear'}

### Creación del Nuevo Modelo

In [9]:
pipeline_3 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model',SVC(C=0.5,kernel='linear'))
])
pipeline_3.fit(X_train,y_train)

In [10]:
import joblib
joblib.dump(pipeline_3,'modeloSVC.joblib')

['modeloSVC.joblib']