<a href="https://colab.research.google.com/github/An210/ML/blob/main/Text%20Classification%20Model%20Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ticket Assignment - KNN model training


In [6]:
!pip install nltk

import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import wordnet

# Download required NLTK resources
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')


def augment_text(text, num_aug=1):
    """Augments text using synonym replacement."""
    augmented_texts = [text]  # Start with original text
    for _ in range(num_aug):
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)

        new_tokens = []
        for token, pos_tag in pos_tags:
            synonyms = []
            for syn in wordnet.synsets(token):
                for lemma in syn.lemmas():
                    synonyms.append(lemma.name())

            if synonyms and pos_tag.startswith(('N', 'V', 'J', 'R')):  # Augment nouns, verbs, adjectives, adverbs
                new_token = np.random.choice(synonyms, 1)[0]
            else:
                new_token = token

            new_tokens.append(new_token)

        augmented_texts.append(" ".join(new_tokens))

    return augmented_texts

# Read the Excel file into a pandas DataFrame
df = pd.read_csv("/content/Train1.csv")

# Extract the text data and labels from the DataFrame
text_data = df["Data"].tolist()
labels = df["Label"].tolist()

# Data Augmentation
augmented_text_data = []
augmented_labels = []
for text, label in zip(text_data, labels):
    augmented_texts = augment_text(text, num_aug=5)  # Generate augmented texts per original text
    augmented_text_data.extend(augmented_texts)
    augmented_labels.extend([label] * len(augmented_texts))

# 1. TF-IDF Feature Extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(augmented_text_data)
X = np.nan_to_num(X.toarray())  # Replace NaN and infinite values with finite numbers

# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, augmented_labels, test_size=0.2, random_state=42
)

# 3. Define models and parameter grids
models = {
    "KNN": (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    }),
    "SVM": (SVC(), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }),
    "DecisionTree": (DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30]
    }),
    "RandomForest": (RandomForestClassifier(), {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20]
    })
}

# 4. Loop through models and perform grid search
results = {}
for model_name, (model, param_grid) in models.items():
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        verbose=1,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    # Store the best results
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    results[model_name] = {
        "Best Parameters": best_params,
        "Best Cross-Validation Score": best_score
    }

    # Train the best model and evaluate on test data
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    results[model_name]["Test Accuracy"] = test_accuracy

# 5. Print results for each model
for model_name, result in results.items():
    print(f"Results for {model_name}:")
    print(f"Best Parameters: {result['Best Parameters']}")
    print(f"Best Cross-Validation Score: {result['Best Cross-Validation Score']}")
    print(f"Test Accuracy: {result['Test Accuracy']}")
    print("-" * 40)




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Running GridSearchCV for KNN...
Fitting 5 folds for each of 24 candidates, totalling 120 fits


 0.53611111 0.86111111        nan 0.83611111        nan 0.83611111
        nan 0.76666667        nan 0.71944444 0.88611111 0.88611111
 0.78611111 0.86111111 0.73888889 0.90833333 0.53611111 0.86111111]


Running GridSearchCV for SVM...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Running GridSearchCV for DecisionTree...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Running GridSearchCV for RandomForest...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Results for KNN:
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
Best Cross-Validation Score: 0.9083333333333332
Test Accuracy: 1.0
----------------------------------------
Results for SVM:
Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best Cross-Validation Score: 0.8861111111111111
Test Accuracy: 1.0
----------------------------------------
Results for DecisionTree:
Best Parameters: {'criterion': 'gini', 'max_depth': 10}
Best Cross-Validation Score: 0.836111111111111
Test Accuracy: 0.8181818181818182
----------------------------------------
Results for RandomForest:
Best Parameters: {'criterion': 'gini', 'max_depth': None, 'n_estimators': 50}
Bes

# Test Case

In [10]:
def predict_new_data(new_data, vectorizer, model):
    """Predicts labels for new data using the trained model.

    Args:
        new_data (list): A list of text strings representing the new data.
        vectorizer (TfidfVectorizer): The trained TF-IDF vectorizer.
        model (KNeighborsClassifier): The trained KNN model.

    Returns:
        list: The predicted labels for the new data.
    """
    # 1. Transform new data using the trained vectorizer
    new_data_tfidf = vectorizer.transform(new_data)

    # 2. Make predictions using the trained model
    predictions = model.predict(new_data_tfidf)

    return predictions


# Example usage:
new_data = [
    "This is a new server request",
    "Need help with Power BI",
    "Database issue",
]

print("Best model:", best_model)
predicted_labels = predict_new_data(new_data, vectorizer, best_model)

print("New data:", new_data)
print("Predicted labels:", predicted_labels)

Best model: RandomForestClassifier(n_estimators=50)
New data: ['This is a new server request', 'Need help with Power BI', 'Database issue']
Predicted labels: ['IT' 'D&A' 'D&A']
