In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re,string,unicodedata
import nltk
from wordcloud import WordCloud
from nltk import ngrams
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.naive_bayes import MultinomialNB

from collections import Counter
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import accuracy_score ,roc_auc_score,f1_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import GradientBoostingClassifier
import mlflow
import mlflow.sklearn

# Load the training data and keep only the two columns used below.
df_train = pd.read_csv("train.csv", encoding='latin-1')
# .copy() makes the column subset an independent frame, so the column
# assignment on the next line is not a chained assignment on a selection.
df_train = df_train[['text', 'sentiment']].copy()
df_train['text'] = df_train['text'].astype(str)
# Binary task: drop the 'neutral' rows. Copy again for the same reason as
# above, because 'sentiment' is overwritten right after the filter.
df_train = df_train[df_train['sentiment'] != 'neutral'].copy()
# Encode the remaining labels as integers (positive -> 1, negative -> 0).
df_train['sentiment'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})
X = df_train['text']
y = df_train['sentiment']

class TextCleaner():
    """Pipeline step that normalises raw text with a fixed series of regex removals.

    Provides the ``fit``/``transform`` pair used by the ``Pipeline`` built
    later in this file. The step holds no state: ``fit`` learns nothing.
    """

    # Patterns applied in this order by ``clean_text``; each match is replaced
    # with the empty string. They are raw strings so that ``\[``, ``\S``,
    # ``\w`` and ``\d`` reach the regex engine as written instead of relying on
    # Python passing unknown string escapes through (which newer interpreters
    # warn about). Compiled once here rather than on every call.
    _SUBSTITUTIONS = (
        re.compile(r'\[.*?\]'),  # any run of characters in square brackets
        re.compile(r'https?://\S+|www\.\S+'),  # links
        re.compile(r'<.*?>+'),  # HTML tags
        re.compile('[%s]' % re.escape(string.punctuation)),  # punctuation
        re.compile(r'\n'),  # newline characters
        re.compile(r'\w*\d\w*'),  # words containing digits
        re.compile(r'[^a-z/A-Z/0-9/ ]'),  # anything but ASCII letters, digits, '/', space
    )

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the pipeline interface."""
        return self

    def transform(self, X):
        """Return a list holding ``clean_text(text)`` for every item of ``X``."""
        return [self.clean_text(text) for text in X]

    def clean_text(self, text):
        """Lower-case ``text`` and strip everything matched by ``_SUBSTITUTIONS``.

        ``text`` is converted with ``str()`` first, so non-string input is
        accepted. Matches are removed without inserting a separator, so words
        on either side of a removed newline end up joined.
        """
        text = str(text).lower()
        for pattern in self._SUBSTITUTIONS:
            text = pattern.sub('', text)
        return text

class stopwords():
    """Pipeline step that removes English stop words from whitespace-split text.

    NOTE: this class name rebinds the module-level ``stopwords`` name that was
    imported from ``nltk.corpus`` earlier in this file, which is why the NLTK
    corpus is imported locally (under an alias) where it is needed.
    """

    # Extra tokens removed in addition to NLTK's English stop-word list.
    _EXTRA_WORDS = ('s', 'm', 'u', 'im', 'ye', 'id', 'atg', 'na', 'ta', 'gon', 'wan')

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the pipeline interface."""
        return self

    def transform(self, X):
        """Return a list holding ``stopw(text)`` for every item of ``X``."""
        return [self.stopw(text) for text in X]

    def _stop_word_set(self):
        """Return the set of words to drop, building it on first use.

        The set is cached on the instance so the corpus is read once instead
        of once per document, and membership tests are set lookups rather
        than scans of a list.
        """
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            from nltk.corpus import stopwords as nltk_stopwords
            cached = frozenset(nltk_stopwords.words('english')).union(self._EXTRA_WORDS)
            self._stop_words = cached
        return cached

    def stopw(self, text):
        """Return ``text`` with stop words removed, tokens re-joined by single spaces."""
        blocked = self._stop_word_set()
        return ' '.join(word for word in text.split() if word not in blocked)

class lemma():
    """Pipeline step that rewrites each text as the space-joined lemmas of its tokens.

    ``lemma_model`` is called once per text; the value it returns is iterated
    and the ``lemma_`` attribute of every item is read. In this file a spaCy
    model loaded with ``spacy.load`` is passed in.
    """

    def __init__(self, lemma_model):
        # Callable that turns a text into an iterable of tokens.
        self.lemma_model = lemma_model

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the pipeline interface."""
        return self

    def transform(self, X):
        """Return a list holding ``lemmatise(text)`` for every item of ``X``."""
        return list(map(self.lemmatise, X))

    def lemmatise(self, text):
        """Run ``lemma_model`` on ``text`` and join the tokens' lemmas with spaces."""
        doc = self.lemma_model(text)
        return " ".join(token.lemma_ for token in doc)

def to_array(X):
    """Return whatever ``X.toarray()`` produces (e.g. the dense form of a sparse matrix)."""
    dense = X.toarray()
    return dense
    
# Text-classification pipeline: clean -> drop stop words -> lemmatise ->
# TF-IDF -> multinomial naive Bayes.
# NOTE: `pipeline` is rebound to a newly built pipeline further down in this
# file before anything is fitted, so this instance is never trained.
pipeline = Pipeline(
    steps=[
        ('cleaner', TextCleaner()),
        ('stop_words_removal', stopwords()),
        (
            'lemmatization',
            # Parser and NER are switched off when loading the spaCy model.
            lemma(spacy.load("en_core_web_sm", disable=['parser', 'ner'])),
        ),
        ('TF-IDF-fit', TfidfVectorizer()),
        ('model', MultinomialNB()),
    ]
)
# Set the tracking URI to the local tracking server
mlflow.set_tracking_uri('http://127.0.0.1:5000')
experiment_name = "mlflow_pipeline_experiment"
# Lookup result from *before* activation: None when the experiment does not
# exist yet. Kept as a module-level name as before.
experiment = mlflow.get_experiment_by_name(experiment_name)

# set_experiment() makes the named experiment active and creates it first if
# it does not exist, so no separate create_experiment()/if-else is needed.
mlflow.set_experiment(experiment_name)



from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline = Pipeline([
    ('cleaner', TextCleaner()),
    ('stop_words_removal', stopwords()),
    ('lemmatization', lemma(spacy.load("en_core_web_sm", disable=['parser', 'ner']))),
    ('TF-IDF', TfidfVectorizer()),
    ('model', MultinomialNB())
])

# Fit every step on the training rows only. The TF-IDF step is fitted here, on
# the cleaned and lemmatised text; it is not fitted separately on the raw text
# beforehand, because that result was discarded and the vectorizer was refitted
# by this call anyway.
pipeline.fit(X_train, y_train)

# TF-IDF features exactly as the classifier sees them: a slice of a Pipeline
# shares the already-fitted step objects, so this only transforms.
feature_steps = pipeline[:-1]
X_train_tfidf = feature_steps.transform(X_train)
X_test_tfidf = feature_steps.transform(X_test)

# Predicting from the precomputed features gives the same result as
# pipeline.predict(...) without running the text preprocessing a second time.
classifier = pipeline.named_steps['model']
y_pred_train = classifier.predict(X_train_tfidf)

# Predict on the test set
y_pred = classifier.predict(X_test_tfidf)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred)

f1_train = f1_score(y_train, y_pred_train)
f1_test = f1_score(y_test, y_pred)


# Record the fitted pipeline and its train/test metrics as one MLflow run.
with mlflow.start_run(run_name="Experiment"):
    mlflow.log_param("model_type", 'multinomial')
    mlflow.sklearn.log_model(pipeline, "model")
    mlflow.log_metric("accuracy train", accuracy_train)
    mlflow.log_metric("accuracy test", accuracy_test)
    mlflow.log_metric("f1 train", f1_train)
    mlflow.log_metric("f1 test", f1_test)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Praveen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

