In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

def get_data(nrows=1000):
    '''returns a DataFrame with nrows from downloaded Keggle csv in raw_data folder'''
    dataset_1 = pd.read_csv("../raw_data/dataset_1.csv", nrows=nrows)
    df = dataset_1.copy()
    return df


def clean_data(df):
    '''returns cleaned DataFrame'''
    
    # dropping redundant columns
    df_clean = df[['Negative_Review', 'Positive_Review', 'Reviewer_Score']]

    # Cleaning, merging and renaming negative and positive reviews
    df_clean.loc[:,'Negative_Review'] = df_clean.loc[:,'Negative_Review'].replace(to_replace="No Negative", value="")
    df_clean.loc[:,'Positive_Review'] = df_clean.loc[:,'Positive_Review'].replace(to_replace="No Positive", value="")
    df_clean.loc[:,"reviews"] = df_clean.loc[:,'Negative_Review'] + " " + df_clean.loc[:,'Positive_Review']
    df_clean.loc[:,"review_score"] = df_clean.loc[:,'Reviewer_Score']
    df_clean = df_clean.drop(columns=['Negative_Review', 'Positive_Review', 'Reviewer_Score'])

    # Remove reviews with less than 10 words (or signs)
    df_clean.loc[:,'length'] = df_clean['reviews'].apply(lambda x: len(word_tokenize(str(x))))
    df_clean.drop(df_clean[df_clean['length'] < 11].index, inplace=True)
    df_clean.drop(columns=['length'], inplace=True)
    df_clean.reset_index(drop=True, inplace=True)

    return df_clean

In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


def custom_stopwords():
    """create custom stopwords list excluding negative words"""
    negative_words = ['no',
    'nor',
    'not',
    "don't",
    'should',
    "should've",
    'aren',
    "aren't",
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'isn',
    "isn't",
    "wasn't",
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't"]

    custom_stopwords = [x for x in stopwords.words('english') if x not in negative_words]

    return custom_stopwords


def clean_for_nlp(text):
    """ preprocess review text data for nlp analysis """
    # Lower case
    text = ''.join(text)
    text = text.lower()
    # Remove numbers
    text = ''.join(word for word in text if not word.isdigit())
    # Remove punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    # Remove stopwords
    text = word_tokenize(text)
    stopwords = custom_stopwords()
    text = [w for w in text if not w in stopwords]
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(word for word in text)

    return(text)

In [4]:
class TextProcessor(BaseEstimator, TransformerMixin):
    """ Custom Transformer for text to nlp-preprocessed  """

    def __init__(self):
        self.vectorizer = CountVectorizer(dtype=np.int32)

    def fit(self, X, y=None):
        X_transformed = list(map(clean_for_nlp, X['reviews']))
        self.vectorizer.fit(X_transformed)
        return self

    def transform(self, X, y=None):
        X_transformed = list(map(clean_for_nlp, X['reviews']))
        X_vectorized = self.vectorizer.transform(X_transformed).toarray()

        return pd.DataFrame(X_vectorized)

In [None]:
!ls 

In [5]:
import joblib
from termcolor import colored
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


class Trainer(object):
    def __init__(self, X, y):
        """
            X: pandas DataFrame
            y: pandas Series
        """
        self.pipeline = None
        self.X = X
        self.y = y
   
    def set_pipeline(self):
        """defines the pipeline as a class attribute"""
        nlp_transformer = Pipeline([('text_preprocessor', TextProcessor())])

        preproc_pipe = ColumnTransformer([
        ('nlp_transformer', nlp_transformer, ["reviews"])], remainder="drop")

        self.pipeline = Pipeline([('preproc', preproc_pipe), ('linear_model', LinearRegression())])


    def run(self):
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)
        print("trained model")


    def evaluate(self):
        """evaluates the pipeline and returns r2"""
        pass
        #cv = cross_val_score(self.pipeline, self.X, self.y, cv=5, scoring='r2').mean()

        #return cv


    def save_model(self):
        """Save the model into a .joblib format"""
        joblib.dump(self.pipeline, 'model.joblib')
        print(colored("model.joblib saved locally", "green"))

In [7]:
N = 1000
df = get_data(nrows=N)
df = clean_data(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [8]:
df.head()

Unnamed: 0,reviews,review_score
0,I am so angry that i made this post available...,2.9
1,No real complaints the hotel was great great...,7.5
2,Rooms are nice but for elderly a bit difficul...,7.1
3,My room was dirty and I was afraid to walk ba...,3.8
4,You When I booked with your company on line y...,6.7


In [9]:
y = df["review_score"]
X = df.drop("review_score", axis=1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [11]:
trainer = Trainer(X=X_train, y=y_train)


In [12]:
y_train

648     9.2
257     7.5
591     7.1
517    10.0
199     6.7
       ... 
162     5.0
754     7.5
493    10.0
56      7.1
559     5.4
Name: review_score, Length: 734, dtype: float64

In [13]:
trainer.run()

trained model


In [None]:
trainer.pipeline

In [None]:
preproc = trainer.pipeline.named_steps['preproc']

In [None]:
X1 = preproc.transform(X)

In [None]:
X1.shape

In [None]:
linear_model = trainer.pipeline.named_steps['linear_model']

In [None]:
len(linear_model.coef_)

In [None]:
X_train

In [14]:
trainer.pipeline.predict(X_train)

array([ 9.2,  7.5,  7.1, 10. ,  6.7,  6.7,  7.5,  5.8,  8.8,  7.9,  8.8,
        9.2, 10. , 10. ,  9.6,  7.5,  7.9,  3.8,  8.3,  7.9,  7.1,  7.9,
       10. ,  9.2, 10. ,  6.3,  9.6, 10. ,  6.3,  9.2,  8.3,  4.6,  9.2,
        7.9, 10. , 10. ,  7.9,  7.9,  8.8, 10. ,  7.5,  8.8, 10. ,  9.2,
        3.8,  7.1,  9.2,  9.2,  7.9,  5. ,  8.3,  8.8,  9.2,  9.2,  7.9,
        5.8,  9.6, 10. ,  8.8,  9.6,  5.8,  9.6,  7.5,  9.2,  9.2, 10. ,
       10. ,  7.9,  2.9,  9.6,  4.6,  8.8,  5.8,  4.2,  8.8,  8.3, 10. ,
        9.2, 10. ,  8.3,  6.3, 10. ,  8.8,  5. ,  8.3,  9.2,  9.2,  7.1,
        4.6,  8.3,  7.5,  7.1,  9.2,  5.8,  7.5,  6.3,  5.8, 10. ,  7.9,
        8.8,  9.2,  6.7,  6.3,  6.7,  8.8,  7.1,  7.9,  8.8,  8.8,  7.1,
        8.3, 10. ,  7.9,  7.5,  9.6,  9.2,  9.6,  6.7,  7.5,  9.6,  9.2,
        7.5,  5.8,  9.6,  9.6,  8.8,  8.8,  9.2,  6.7,  7.9,  7.1,  4.6,
       10. ,  5.8, 10. ,  5.8,  7.5,  9.6, 10. ,  8.8,  7.5, 10. ,  7.9,
        6.3,  6.7,  7.1, 10. ,  7.5, 10. ,  8.3,  4

In [15]:
X = pd.DataFrame({"reviews": ["hi my name is Nizar and I'm the superstar around"]})

In [None]:
X = pd.DataFrame({"reviews": ["hi my name is Kilian"]})

In [None]:
X_train.shape

In [None]:
X.shape

In [16]:
trainer.pipeline.predict(X)

array([7.74046862])

In [None]:
trainer.pipeline.predict(X)