# Scikit-Learn

# Load Dataset

In [4]:
import pandas as pd

In [5]:
# Load the raw dataset; the file is tab-separated despite its .csv extension.
df = pd.read_csv("data.csv", sep="\t", encoding="utf-8")

In [6]:
# Display the loaded frame to check its columns (sentence, target, source).
df

Unnamed: 0,sentence,target,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon
...,...,...,...
2743,I think food should have flavor and texture an...,0,yelp
2744,Appetite instantly gone.,0,yelp
2745,Overall I was not impressed and would not go b...,0,yelp
2746,"The whole experience was underwhelming, and I ...",0,yelp


## Split dataset

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df["sentence"],
    df["target"],
    test_size=0.2,
    random_state=123,
)

In [9]:
# Training sentences and labels should have the same number of rows.
X_train.shape, y_train.shape

((2198,), (2198,))

In [10]:
# Test sentences and labels should have the same number of rows.
X_test.shape, y_test.shape

((550,), (550,))

# Preprocessing

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Important: the vocabulary is learned from the training split only, so the
# test split stays unseen while the vectorizer is fitted.
count_vectorizer = CountVectorizer(min_df=1, lowercase=True).fit(X_train)

# Encode both splits with that same fitted vocabulary.
X_train_encoded, X_test_encoded = map(count_vectorizer.transform, (X_train, X_test))

In [13]:
# Sparse count matrix: one row per training sentence, one column per vocabulary term.
X_train_encoded

<2198x4529 sparse matrix of type '<class 'numpy.int64'>'
	with 24039 stored elements in Compressed Sparse Row format>

In [14]:
# Encoded with the vocabulary fitted on the training split, so the column count matches.
X_test_encoded

<550x4529 sparse matrix of type '<class 'numpy.int64'>'
	with 5563 stored elements in Compressed Sparse Row format>

# MLflow

In [15]:
import mlflow
from mlflow.data.pandas_dataset import PandasDataset

# Send all experiments and runs to the tracking server at this address
# (assumes an MLflow server is already listening locally on port 8080).
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

In [16]:
# Wrap the full DataFrame (as loaded, before the train/test split) as an MLflow
# dataset so that each run can record which data it used.
dataset: PandasDataset = mlflow.data.from_pandas(df, source="data.csv")

  return _dataset_source_registry.resolve(


## Experiment 1 (Logistic Regression)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

In [18]:
experiment_name = "sentiment_analysis_logistic_regression"

# Descriptive tags stored on the experiment.
experiment_tags = {
    "nlp.framework": "Scikit Learn",
    "nlp.encoding": "CountVectorizer",
    "nlp.model": "Logistic Regression",
    "nlp.task": "Sentiment Analysis"
}

# mlflow.create_experiment raises when the name is already taken, which makes
# this cell fail on a second run. Reuse the existing experiment when there is one.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name,
                                             artifact_location="mlartifacts",
                                             tags=experiment_tags)
else:
    experiment_id = existing_experiment.experiment_id

experiment_id  # shown as the cell output

'156473517357470349'

In [19]:
mlflow.set_experiment(experiment_name=experiment_name)  # The experiment ID could be used instead

# One MLflow run is recorded per hyperparameter set below.
# NOTE(review): max_iter=2 stops lbfgs before it converges (it triggers the
# "ITERATIONS REACHED LIMIT" warnings in the cell output) - presumably a
# deliberately weak baseline for comparison; confirm.
params_list = [
    {
        "penalty": "l2",
        "solver": "lbfgs",
        "max_iter": 2
    },
    {
        "penalty": "l2",
        "solver": "liblinear",
        "max_iter": 200
    }
]

for params in params_list:

    # 5-fold cross-validation on the training split only
    clf = LogisticRegression(**params, random_state=123)
    scores = cross_val_score(estimator=clf, X=X_train_encoded, y=y_train, cv=5)
    print("-"*20)
    print(params)
    print(scores)
    print("-"*20)

    # Refit on the whole training split, then evaluate on the held-out test split
    clf.fit(X_train_encoded, y_train)
    y_pred = clf.predict(X_test_encoded)

    metrics = {
        "cv_score": scores.mean(),
        "accuracy": accuracy_score(y_true=y_test, y_pred=y_pred),
        "precision": precision_score(y_true=y_test, y_pred=y_pred),
        "recall": recall_score(y_true=y_test, y_pred=y_pred),
        "f1": f1_score(y_true=y_test, y_pred=y_pred)
    }

    # Readable run name built from the hyperparameters, e.g. "penalty_l2_solver_lbfgs_max_iter_2"
    run_name = "_".join(f"{k}_{v}" for k, v in params.items())

    with mlflow.start_run(run_name=run_name):
        # Log the hyperparameters
        mlflow.log_params(params)

        # Log the metrics
        mlflow.log_metrics(metrics)

        # Log the dataset
        # NOTE(review): `dataset` wraps the full DataFrame, not only the training split.
        mlflow.log_input(dataset, context="training")

        # Log the model. The input example only documents the expected input
        # format, so a few encoded rows are enough; passing the whole training
        # matrix would store every training row alongside the model.
        mlflow.sklearn.log_model(sk_model=clf,
                                 artifact_path="yelp_model",
                                 input_example=X_train_encoded[:5])
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

--------------------
{'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 2}
[0.66818182 0.62954545 0.64318182 0.68792711 0.68109339]
--------------------


  return _infer_schema(self._df)


--------------------
{'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 200}
[0.83181818 0.82954545 0.79772727 0.82687927 0.82232346]
--------------------




## Experiment 2 (Decision Tree)

In [20]:
from sklearn.tree import DecisionTreeClassifier

In [21]:
experiment_name = "sentiment_analysis_decision_tree"

# Descriptive tags stored on the experiment.
experiment_tags = {
    "nlp.framework": "Scikit Learn",
    "nlp.encoding": "CountVectorizer",
    "nlp.model": "Decision Tree",
    "nlp.task": "Sentiment Analysis"
}

# mlflow.create_experiment raises when the name is already taken, which makes
# this cell fail on a second run. Reuse the existing experiment when there is one.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(name=experiment_name,
                                             tags=experiment_tags)
else:
    experiment_id = existing_experiment.experiment_id

experiment_id  # shown as the cell output

'827846084136165640'

In [23]:
# NOTE: this cell repeats the logistic-regression loop almost line for line;
# it could be turned into a shared function instead of duplicating the code.
mlflow.set_experiment(experiment_name=experiment_name)  # The experiment ID could be used instead

# One MLflow run is recorded per hyperparameter set below.
params_list = [
    {
        "criterion": "entropy",
        "splitter": "random",
    },
    {
        "criterion": "gini",
        "splitter": "best",
    }
]

for params in params_list:

    # 5-fold cross-validation on the training split only
    clf = DecisionTreeClassifier(**params, random_state=123)
    scores = cross_val_score(estimator=clf, X=X_train_encoded, y=y_train, cv=5)
    print("-"*20)
    print(params)
    print(scores)
    print("-"*20)

    # Refit on the whole training split, then evaluate on the held-out test split
    clf.fit(X_train_encoded, y_train)
    y_pred = clf.predict(X_test_encoded)

    metrics = {
        "cv_score": scores.mean(),
        "accuracy": accuracy_score(y_true=y_test, y_pred=y_pred),
        "precision": precision_score(y_true=y_test, y_pred=y_pred),
        "recall": recall_score(y_true=y_test, y_pred=y_pred),
        "f1": f1_score(y_true=y_test, y_pred=y_pred)
    }

    # Readable run name built from the hyperparameters, e.g. "criterion_gini_splitter_best"
    run_name = "_".join(f"{k}_{v}" for k, v in params.items())

    with mlflow.start_run(run_name=run_name):
        # Log the hyperparameters
        mlflow.log_params(params)

        # Log the metrics
        mlflow.log_metrics(metrics)

        # Log the dataset
        # NOTE(review): `dataset` wraps the full DataFrame, not only the training split.
        mlflow.log_input(dataset, context="training")

        # Log the model. The input example only documents the expected input
        # format, so a few encoded rows are enough; passing the whole training
        # matrix would store every training row alongside the model.
        mlflow.sklearn.log_model(sk_model=clf,
                                 artifact_path="yelp_model",
                                 input_example=X_train_encoded[:5])
    

--------------------
{'criterion': 'entropy', 'splitter': 'random'}
[0.73181818 0.76818182 0.73409091 0.76309795 0.72437358]
--------------------




--------------------
{'criterion': 'gini', 'splitter': 'best'}
[0.75909091 0.74318182 0.73409091 0.74259681 0.76082005]
--------------------


