In [9]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

# Fetch the NLTK resources used by the text-cleaning cells:
# the stop-word corpus, the WordNet data and the punkt tokenizer models.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
import pandas as pd

# Preprocessed article dump (data from theindianexpress).
# NOTE(review): absolute, machine-specific path; anyone else running the notebook must edit it.
DATASET_CSV = "D:/Projects/News-Data-Classification-Application/dataset/PreprocessedData_The_indian_Times.csv"
df = pd.read_csv(DATASET_CSV)

In [12]:
# Discard rows that lack a title or a body, then renumber the remaining rows from 0.
df_cl = (
    df
    .dropna(subset=['Title', 'Article_body'])
    .reset_index(drop=True)
)

In [13]:
# De-duplicate on the article content itself. The frame also carries saved row-id
# columns ('Unnamed: 0', 'Unnamed: 0.1'); those presumably differ on every row, so a
# whole-row comparison would never flag two copies of the same article as duplicates.
# reset_index(drop=True) returns a fresh frame with contiguous 0..n-1 labels, so later
# column assignments and label lookups do not depend on which rows were dropped.
df_cleaned = (
    df_cl
    .drop_duplicates(subset=['Title', 'Article_body'], keep='first')
    .reset_index(drop=True)
)

In [14]:
# Distinct section names, in order of first appearance.
section_labels = df_cleaned['section_value']
target_category = section_labels.unique()
print(target_category)

# Integer-encode the section name; factorize() numbers labels in order of first appearance.
df_cleaned['section_valueId'] = section_labels.factorize()[0]
df_cleaned

['Entertainment' 'Sports' 'World' 'Business']


Unnamed: 0.1,Unnamed: 0,Title,Article_body,section_value,full_article,SectionEncoded,section_valueId
0,0,Salaar box office collection Day 13: Prabhas’ ...,The Prabhas-starrer Salaar has been playing in...,Entertainment,salaar box office collection day prabhas block...,1,0
1,1,Khushi Kapoor says father Boney Kapoor cried a...,Sisters Janhvi Kapoor and Khushi Kapoor appear...,Entertainment,khushi kapoor say father boney kapoor cry watc...,1,0
2,2,Janhvi Kapoor recalls ‘howling’ when she got t...,When veteran actor Sridevi passed away in Febr...,Entertainment,janhvi kapoor recall howl get call sridevis de...,1,0
3,3,Deepika Padukone says she’s looking forward to...,Deepika Padukone has always said that she and ...,Entertainment,deepika padukone say shes look forward childre...,1,0
4,4,Janhvi Kapoor says boyfriend Shikhar Pahariya ...,Janhvi Kapoor and her sister Khushi Kapoor app...,Entertainment,janhvi kapoor say boyfriend shikhar pahariya u...,1,0
...,...,...,...,...,...,...,...
123,153,Watch: Japan’s Nagaoka launches world’s larges...,As the clock struck midnight to welcome the Ne...,Business,watch japan nagaoka launch worlds largest fire...,0,3
124,154,"India vs South Africa, Cape Town Test: Lessons...",What’s the best thing to have happened to Indi...,Business,india vs south africa cape town test lessons w...,0,3
125,155,Sports maharajas and their fiefs,There’s a true story about a Delhi cricket adm...,Business,sport maharajas fiefs theres true story delhi ...,0,3
126,156,India’s way forward: Services or manufacturing?,India is awash with discussions about where it...,Business,indias way forward service manufacture india a...,0,3


In [15]:
# Lookup table of (section name, integer id) pairs, ordered by id.
category = (
    df_cleaned[['section_value', 'section_valueId']]
    .drop_duplicates()
    .sort_values('section_valueId')
)


In [16]:
#Remove All Tags
import re

def remove_tags(text):
    """Return ``text`` with HTML/XML-style tags removed.

    A tag is any ``<...>`` span containing at least one character other than
    ``>``. The previous pattern was the empty string, which matches only empty
    positions, so nothing was ever removed.

    Parameters
    ----------
    text : str
        Raw text that may contain markup.

    Returns
    -------
    str
        The text with every tag deleted; text outside tags is left untouched.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)
# Run tag removal over every article and write the result back into the same column.
df_cleaned.loc[:, 'full_article'] = (
    df_cleaned['full_article']
    .apply(remove_tags)
)

In [17]:
#Remove Special Characters
def special_char(text):
    """Replace every non-alphanumeric character in ``text`` with a single space.

    Characters for which ``str.isalnum()`` is true are kept as they are; each
    other character (punctuation, whitespace, symbols) becomes one space, so the
    result always has the same length as the input.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # A single join avoids rebuilding the string once per character.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)
# Blank out special characters in every article and store the result in place.
df_cleaned.loc[:, 'full_article'] = (
    df_cleaned['full_article']
    .apply(special_char)
)

In [18]:
#Convert Everything in Lower Case

def convert_lower(text):
    """Return ``text`` lower-cased via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

# Lower-case every article in place.
df_cleaned.loc[:, 'full_article'] = df_cleaned['full_article'].apply(convert_lower)

# Preview the second article by position; a label lookup ([1]) would raise KeyError
# if the row labelled 1 had been removed by an earlier filtering step.
df_cleaned['full_article'].iloc[1]

'khushi kapoor say father boney kapoor cry watch archies janhvi kapoor list sister common sridevi bias aside sisters janhvi kapoor khushi kapoor appear guests latest episode koffee karan speak candidly journey film business khushi also speak familys reaction debut moviekhushi kapoor younger daughter boney kapoor sridevi make debut zoya akhtars archies film release last month netflix earn mix review episode karan ask khushi audition film come back new york intention start point go audition fully shake first office go first audition ever give scar zoya make really easy though calm freak think go well khushi say indian express entertainment whatsapp channel follow us latest news interview review photos land covet part betty cooper film khushi say immediately start howl cry know want longest time moment felt okay happen zoya ideal situation get really emotional debutante reveal boney kapoor get emotional watch archies start cry janhvi add cry three days khushi say would get random message 

In [19]:
#Remove all Stopwords

def remove_stopwords(text):
    """Tokenize ``text`` and drop English stop words.

    The text is split with NLTK's ``word_tokenize``; tokens that appear in
    NLTK's English stop-word list (exact, case-sensitive match) are discarded.

    Returns
    -------
    list of str
        The remaining tokens in their original order (a list, not a string).
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token not in english_stops:
            kept_tokens.append(token)
    return kept_tokens
# Replace each article string with its list of non-stop-word tokens.
df_cleaned.loc[:, 'full_article'] = df_cleaned['full_article'].apply(remove_stopwords)

# Preview the second article by position; a label lookup ([1]) would raise KeyError
# if the row labelled 1 had been removed by an earlier filtering step.
df_cleaned['full_article'].iloc[1]

['khushi',
 'kapoor',
 'say',
 'father',
 'boney',
 'kapoor',
 'cry',
 'watch',
 'archies',
 'janhvi',
 'kapoor',
 'list',
 'sister',
 'common',
 'sridevi',
 'bias',
 'aside',
 'sisters',
 'janhvi',
 'kapoor',
 'khushi',
 'kapoor',
 'appear',
 'guests',
 'latest',
 'episode',
 'koffee',
 'karan',
 'speak',
 'candidly',
 'journey',
 'film',
 'business',
 'khushi',
 'also',
 'speak',
 'familys',
 'reaction',
 'debut',
 'moviekhushi',
 'kapoor',
 'younger',
 'daughter',
 'boney',
 'kapoor',
 'sridevi',
 'make',
 'debut',
 'zoya',
 'akhtars',
 'archies',
 'film',
 'release',
 'last',
 'month',
 'netflix',
 'earn',
 'mix',
 'review',
 'episode',
 'karan',
 'ask',
 'khushi',
 'audition',
 'film',
 'come',
 'back',
 'new',
 'york',
 'intention',
 'start',
 'point',
 'go',
 'audition',
 'fully',
 'shake',
 'first',
 'office',
 'go',
 'first',
 'audition',
 'ever',
 'give',
 'scar',
 'zoya',
 'make',
 'really',
 'easy',
 'though',
 'calm',
 'freak',
 'think',
 'go',
 'well',
 'khushi',
 'say',


In [20]:
#Lemmatizing the Words

def lemmatize_word(text):
    """Lemmatize every token in ``text`` and join the results with single spaces.

    ``text`` is iterated item by item, so it is expected to be a sequence of
    tokens (such as the list returned by ``remove_stopwords``); a plain string
    would be processed one character at a time.

    Returns
    -------
    str
        The lemmatized tokens joined by ``" "``.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, text)
    return " ".join(lemmas)
# Lemmatize each token list and collapse it back into one space-separated string.
df_cleaned.loc[:, 'full_article'] = df_cleaned['full_article'].apply(lemmatize_word)

# Preview the second article by position; a label lookup ([1]) would raise KeyError
# if the row labelled 1 had been removed by an earlier filtering step.
df_cleaned['full_article'].iloc[1]

'khushi kapoor say father boney kapoor cry watch archies janhvi kapoor list sister common sridevi bias aside sister janhvi kapoor khushi kapoor appear guest latest episode koffee karan speak candidly journey film business khushi also speak family reaction debut moviekhushi kapoor younger daughter boney kapoor sridevi make debut zoya akhtars archies film release last month netflix earn mix review episode karan ask khushi audition film come back new york intention start point go audition fully shake first office go first audition ever give scar zoya make really easy though calm freak think go well khushi say indian express entertainment whatsapp channel follow u latest news interview review photo land covet part betty cooper film khushi say immediately start howl cry know want longest time moment felt okay happen zoya ideal situation get really emotional debutante reveal boney kapoor get emotional watch archies start cry janhvi add cry three day khushi say would get random message say go

In [21]:
# Independent variable (cleaned article text) and dependent variable (integer section id)
x, y = df_cleaned['full_article'], df_cleaned['section_valueId']

In [23]:
import numpy as np

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: CountVectorizer limited to a 5000-term vocabulary,
# densified into a plain NumPy array.
# NOTE(review): the vocabulary is fitted on every row before the train/test split
# that follows, so terms from the future test rows influence the feature space.
# Fitting on the training split only would give a cleaner estimate.
cv = CountVectorizer(max_features = 5000)
x = cv.fit_transform(df_cleaned.full_article).toarray()

# Target vector: integer section ids as a NumPy array.
y = np.array(df_cleaned.section_valueId.values)

print("X.shape = ",x.shape)
print("y.shape = ",y.shape)

X.shape =  (128, 5000)
y.shape =  (128,)


In [25]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the split is shuffled with a fixed seed
# and stratified on the label vector y.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=50,
    shuffle=True,
    stratify=y,
)

# Count how many training samples fall into each class
unique_classes, counts = np.unique(y_train, return_counts=True)
class_distribution = {label: count for label, count in zip(unique_classes, counts)}

print("Class distribution in training set:")
print(class_distribution)

Class distribution in training set:
{0: 21, 1: 23, 2: 24, 3: 21}


In [38]:

# Point MLflow at the tracking server expected on this machine (port 5000)
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Collects one result record (model name plus metrics) per evaluated model
perform_list = list()

# Define a function for creating and running the pipeline
def create_and_run_pipeline(model, param_grid, x_train, y_train, x_test, y_test):
    """Tune ``model`` with a grid search, log the run to MLflow and record test metrics.

    The estimator is wrapped in ``OneVsRestClassifier`` as the single ``'model'``
    step of a ``Pipeline`` and tuned with 5-fold ``GridSearchCV`` scored on
    accuracy. The best estimator, its parameters and the test-set metrics are
    logged inside one MLflow run named after the estimator class.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator to wrap and tune.
    param_grid : dict
        Grid handed to ``GridSearchCV``. Keys must address the pipeline step,
        e.g. ``'model__estimator__C'``.
    x_train, y_train
        Training features and labels used for the grid search.
    x_test, y_test
        Held-out features and labels used for the reported metrics.

    Returns
    -------
    None
        A result record is appended to the module-level ``perform_list`` and
        the metrics are printed.

    Notes
    -----
    Accuracy is stored as a percentage rounded to two decimals, while
    precision, recall and F1 are stored as fractions.
    """
    model_name = model.__class__.__name__

    pipeline = Pipeline([
        ('model', OneVsRestClassifier(model))
    ])

    # Name the run after the estimator so individual runs can be told apart in
    # MLflow (previously every run carried the same placeholder name).
    with mlflow.start_run(run_name=model_name):

        grid_search = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=5)
        grid_search.fit(x_train, y_train)

        # Log parameters to MLflow
        mlflow.log_param("model_name", model_name)
        mlflow.log_params(grid_search.best_params_)

        # Log the best fitted pipeline as an sklearn model artifact
        mlflow.sklearn.log_model(grid_search.best_estimator_, f"{model_name}_model")

        # Predict on the test set with the refitted best estimator
        y_pred = grid_search.predict(x_test)

        # Performance metrics
        accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
        # NOTE(review): for single-label multiclass targets, micro-averaged
        # precision, recall and F1 are all equal to plain accuracy; consider
        # average='macro' or 'weighted' if per-class behaviour matters.
        precision, recall, f1score, _support = precision_recall_fscore_support(
            y_test, y_pred, average='micro'
        )

        # Log metrics to MLflow
        mlflow.log_metric("test_accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1score", f1score)

        # Add model and metric information to the shared results list
        # (accuracy is already rounded to two decimals above)
        perform_list.append({
            'Model': model_name,
            'Test Accuracy': accuracy,
            'Precision': round(precision, 2),
            'Recall': round(recall, 2),
            'F1': round(f1score, 2)
        })

        print(f'Test Accuracy Score of {model_name} with Hyperparameter Tuning: % {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1-score: {f1score}')

In [39]:
# Hyperparameter grids, one per model.
# Every key targets the estimator wrapped inside the pipeline's 'model' step,
# hence the 'model__estimator__' prefix.

# Regularisation strength only
logreg_param_grid = {
    'model__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Forest size plus tree-shape limits
rf_param_grid = {
    'model__estimator__n_estimators': [50, 100, 150],
    'model__estimator__max_depth': [None, 10, 20, 30],
    'model__estimator__min_samples_split': [2, 5, 10],
    'model__estimator__min_samples_leaf': [1, 2, 4],
}

# Smoothing parameter
nb_param_grid = {
    'model__estimator__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Regularisation strength crossed with kernel choice
svm_param_grid = {
    'model__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'model__estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

# Tree-shape limits
dt_param_grid = {
    'model__estimator__max_depth': [None, 10, 20, 30],
    'model__estimator__min_samples_split': [2, 5, 10],
    'model__estimator__min_samples_leaf': [1, 2, 4],
}

# Neighbourhood size and Minkowski power parameter
knn_param_grid = {
    'model__estimator__n_neighbors': [5, 10, 15],
    'model__estimator__metric': ['minkowski'],
    'model__estimator__p': [1, 2, 4],
}

# ...

# Pair each model with its hyperparameter grid, in order, and evaluate it.
# The final grid is empty, so the model paired with it is searched over a single
# default configuration. zip() stops at the shorter of the two sequences.
param_grids = [
    logreg_param_grid,
    rf_param_grid,
    nb_param_grid,
    svm_param_grid,
    dt_param_grid,
    knn_param_grid,
    {},
]
for model, param_grid in zip(models, param_grids):
    create_and_run_pipeline(model, param_grid, x_train, y_train, x_test, y_test)

print(perform_list)

TypeError: start_run() got an unexpected keyword argument 'experiment_name'