In [1]:
import pandas as pd
import numpy as np

In [65]:
badmintion = "D:/my computer/Downloads/reviews_badminton/data.csv"
badmintion_df = pd.read_csv(badmintion)
badmintion_df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [66]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

def preprocess(text):
    if isinstance(text, str):
        text = re.sub(r'[^\w\s]', '', text) 
        text = re.sub(r'\d+', '', text)
        text = text.lower()
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(text)
        clean_text = [word for word in word_tokens if word not in stop_words]

        stemmer = PorterStemmer()
        clean_text = [stemmer.stem(word) for word in clean_text]

        return clean_text
    else:
        return ''


In [67]:
badmintion_df['clean_text_stem'] = badmintion_df['Review text'].apply(preprocess) 
badmintion_df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings,clean_text_stem
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4,"[nice, product, good, qualiti, price, rise, ba..."
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1,"[didnt, suppli, yonex, mavi, outsid, cover, yo..."
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1,"[worst, product, damag, shuttlecock, pack, new..."
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3,"[quit, k, nowaday, qualiti, cork, like, year, ..."
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1,"[pricedjust, â¹, retaileri, didnt, understand,..."


In [68]:
badmintion_df['clean_text_stem'] = badmintion_df['clean_text_stem'].apply(lambda x : ' '.join(x))
badmintion_df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings,clean_text_stem
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4,nice product good qualiti price rise bad sign ...
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1,didnt suppli yonex mavi outsid cover yonex ad ...
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1,worst product damag shuttlecock pack new box o...
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3,quit k nowaday qualiti cork like year back use...
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1,pricedjust â¹ retaileri didnt understand wat a...


In [69]:
badmintion_df["label"] = badmintion_df["Ratings"].apply(lambda x : 1 if x>=3 else 0)
badmintion_df.head()
badmintion_df.dropna(subset=["clean_text_stem"], inplace=True)

badmintion_df.to_csv("temp.csv")

In [56]:
from sklearn.model_selection import train_test_split, GridSearchCV

X = badmintion_df["clean_text_stem"]
y = badmintion_df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
import mlflow
mlflow.set_experiment("sentiment_prediction")

2024/03/25 20:24:03 INFO mlflow.tracking.fluent: Experiment with name 'sentiment_prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/azays/Projects/Innomatics/mlruns/204075928937821731', creation_time=1711377543201, experiment_id='204075928937821731', last_update_time=1711377543201, lifecycle_stage='active', name='sentiment_prediction', tags={}>

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline


In [10]:
# pipeline = Pipeline([
#     ('vocab', CountVectorizer()),
#     ('clf', RandomForestClassifier(random_state=42))
# ])

pipelines = {
    'knn' : Pipeline([
        ('vocab', CountVectorizer()),
        ('classifier', KNeighborsClassifier())
    ]), 
    'svc' : Pipeline([
        ('vocab', CountVectorizer()),
        ('classifier', SVC())
    ]),
    'logistic_regression': Pipeline([
        ('vocab', CountVectorizer()),
        ('classifier', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('vocab', CountVectorizer()),
        ('classifier', RandomForestClassifier())
    ]),
    'decision_tree': Pipeline([
        ('vocab', CountVectorizer()),
        ('classifier', DecisionTreeClassifier())
    ]),
}

In [11]:
# param_grid = {
#     'clf__n_estimators': [50, 100, 150],
#     'clf__max_depth': [None, 10, 20],
#     'clf__min_samples_split': [2, 5, 10]
# }

param_grids = {
    'knn': [
        {
            'classifier__n_neighbors' : [i for i in range(3, 21, 2)], 
            'classifier__p' : [1, 2, 3]
        }
    ],
    'svc': [
        {
            'classifier__kernel' : ['rbf'], 
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        }, 
        {
            'classifier__kernel' : ['poly'], 
            'classifier__degree' : [2, 3, 4, 5], 
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        }, 
        {
            'classifier__kernel' : ['linear'], 
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        }
    ],
    'logistic_regression': [
        {
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l2']
        }, 
        {
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l1'], 
            'classifier__solver': ['liblinear']
        }, 
        {
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['elasticnet'], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga']
        }
    ],
    'random_forest': [
        {
            'classifier__n_estimators': [50, 100, 200]
        }
    ],
    'decision_tree': [
        {
            'classifier__max_depth': [None, 5, 10]
        }
    ],
}

In [12]:
import warnings

warnings.filterwarnings('ignore')

In [13]:
# clf = GridSearchCV(
#     estimator=pipeline, 
#     param_grid=param_grid, 
#     scoring='accuracy',
#     cv=5,
#     return_train_score=True,
#     n_jobs=-1,
#     verbose=1 # shows total fits
# )

# # max_tuning_runs=None will make sure that all the runs are recorded.
# mlflow.sklearn.autolog(max_tuning_runs=None)

# with mlflow.start_run() as run:
#     %time clf.fit(X_train, y_train)

In [14]:
best_models = {}

# Run the Pipeline
for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time grid_search.fit(X_train, y_train)
        
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(X_test, y_test))
    
    best_models[algo] = grid_search.best_estimator_
    print()

********** knn **********




Fitting 5 folds for each of 27 candidates, totalling 135 fits
CPU times: total: 6min 49s
Wall time: 4min 54s
Train Score:  0.9044615240416161
Test Score:  0.9049295774647887

********** svc **********




Fitting 5 folds for each of 30 candidates, totalling 150 fits
CPU times: total: 8min 4s
Wall time: 8min 32s
Train Score:  0.9184047024196215
Test Score:  0.9213615023474179

********** logistic_regression **********




Fitting 5 folds for each of 15 candidates, totalling 75 fits
CPU times: total: 3min 34s
Wall time: 4min
Train Score:  0.919871838380182
Test Score:  0.9248826291079812

********** random_forest **********




Fitting 5 folds for each of 3 candidates, totalling 15 fits
CPU times: total: 2min 7s
Wall time: 2min 23s
Train Score:  0.9029942803460018
Test Score:  0.9078638497652582

********** decision_tree **********




Fitting 5 folds for each of 3 candidates, totalling 15 fits
CPU times: total: 8.06 s
Wall time: 19.8 s
Train Score:  0.9113588299111293
Test Score:  0.9125586854460094



In [15]:
# Stop the auto logger

# mlflow.sklearn.autolog(disable=True)

In [16]:
# !pip install prefect
# prefect server start


In [100]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn import metrics


In [18]:
from prefect import task, flow

In [101]:
@task
def load_data(file_path):
    """
    Load data from a CSV file.
    """
    return pd.read_csv(file_path)

@task
def split_inputs_output(data, inputs, output):
    """
    Split features and target variables.
    """
    data = data.dropna(subset=[inputs])
    X = data[inputs]
    y = data[output]
    return X, y


@task
def split_train_test(X, y, test_size=0.2, random_state=42):
    """
    Split data into train and test sets.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

@task
def preprocess_data(X_train, X_test, y_train, y_test):
    """
    Preprocess the data.
    """
    vocab = CountVectorizer()
    X_train_preprocess = vocab.fit_transform(X_train)
    X_test_preprocess = vocab.transform(X_test)
    return X_train_preprocess, X_test_preprocess, y_train, y_test


@task
def train_model(X_train_preprocess, y_train,hyperparameters):
    """
    Training the machine learning model.
    """
    clf = KNeighborsClassifier(metric='cosine', **hyperparameters)
    clf.fit(X_train_preprocess, y_train)
    return clf

@task
def evaluate_model(model, X_train_preprocess, y_train, X_test_preprocess, y_test):
    """
    Evaluating the model.
    """
    y_train_pred = model.predict(X_train_preprocess)
    y_test_pred = model.predict(X_test_preprocess)

    train_score = metrics.accuracy_score(y_train, y_train_pred)
    test_score = metrics.accuracy_score(y_test, y_test_pred)
    
    return train_score, test_score

In [102]:
# Workflow

@flow(name="KNN Training Flow")
def workflow():
    DATA_PATH = "temp.csv"
    INPUTS = 'clean_text_stem'
    OUTPUT = 'label'
    HYPERPARAMETERS = {
    'n_neighbors': 3,
    'p': 3
    }

    # Load data
    badmintion = load_data(DATA_PATH)

    # Identify Inputs and Output
    X, y = split_inputs_output(badmintion, INPUTS, OUTPUT)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Preprocess the data
    X_train_preprocess, X_test_preprocess, y_train, y_test = preprocess_data(X_train, X_test, y_train, y_test)

    # Build a model
    model = train_model(X_train_preprocess, y_train, HYPERPARAMETERS)
    
    # Evaluation
    train_score, test_score = evaluate_model(model, X_train_preprocess, y_train, X_test_preprocess, y_test)
    
    print("Train Score:", train_score)
    print("Test Score:", test_score)

In [103]:
# DATA_PATH = "temp.csv"
# badmintion = pd.read_csv(DATA_PATH)
# # badmintion.head()
# badmintion["clean_text_stem"].isna().sum()

In [104]:
if __name__ == "__main__":
    workflow()

2024/03/25 22:01:44 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2f5cad21d4994423b435ff586f5ced27', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Train Score: 0.939042303172738
Test Score: 0.9077555816686251


In [None]:
# if __name__ == "__main__":
#     workflow.deploy(
#         name="deployment1",
#         cron="0 12 * * *",
#     )