<a href="https://colab.research.google.com/github/An210/ML/blob/main/Text%20Classification%20KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ticket Assignment - KNN model training


In [2]:
!pip install nltk

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import wordnet

# Download required NLTK resources
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# The missing download:
nltk.download('averaged_perceptron_tagger_eng') # This line downloads the necessary data package
nltk.download('punkt_tab') # This line is added to download the required data package

def augment_text(text, num_aug=1):
    """Augments text using synonym replacement."""
    augmented_texts = [text]  # Start with original text
    for _ in range(num_aug):
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)

        new_tokens = []
        for token, pos_tag in pos_tags:
            synonyms = []
            for syn in wordnet.synsets(token):
                for lemma in syn.lemmas():
                    synonyms.append(lemma.name())

            if synonyms and pos_tag.startswith(('N', 'V', 'J', 'R')):  # Augment nouns, verbs, adjectives, adverbs
                new_token = np.random.choice(synonyms, 1)[0]
            else:
                new_token = token

            new_tokens.append(new_token)

        augmented_texts.append(" ".join(new_tokens))

    return augmented_texts

# Read the Excel file into a pandas DataFrame
df = pd.read_csv("/content/Train.csv")

# Extract the text data and labels from the DataFrame
text_data = df["Data"].tolist()
labels = df["Label"].tolist()

# Data Augmentation
augmented_text_data = []
augmented_labels = []
for text, label in zip(text_data, labels):
    augmented_texts = augment_text(text, num_aug=5)  # Generate 2 augmented texts per original text
    augmented_text_data.extend(augmented_texts)
    augmented_labels.extend([label] * len(augmented_texts))

# 1. TF-IDF Feature Extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(augmented_text_data)
X = X.toarray()

X = np.nan_to_num(X)  # Replace NaN and infinite values with finite numbers

# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, augmented_labels, test_size=0.2, random_state=42
)

# 3. Perform GridSearchCV for KNN Hyperparameter Tuning
param_grid = {
    'n_neighbors': [3, 5, 7, 9],  # Different values for k
    'weights': ['uniform', 'distance'],  # Weighting schemes
    'metric': ['euclidean', 'manhattan', 'minkowski']  # Distance metrics
}

grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,  # 5-fold cross-validation
    verbose=1,
    n_jobs=-1
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters from GridSearch
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# 4. Train the optimized KNN model
best_knn_classifier = grid_search.best_estimator_
best_knn_classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = best_knn_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)

for i in range(len(X_test)):
    print("Test data point:", X_test[i])
    print("True label:", y_test[i])
    print("Predicted label:", y_pred[i])
    print()




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best Cross-Validation Score: 0.9428571428571428
Test Set Accuracy: 0.8888888888888888
Test data point: [0.         0.         0.         0.61245283 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.79050714 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.        ]
True label: IT
Predicted label: D&A

Test data point: [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
True label: D&A
Predicted label: D&A

Test data point: [0.         0.         0.         0.         0.         0.
 0.79284209 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.      

 0.67142857 0.91428571        nan 0.91428571        nan 0.82857143
        nan 0.82857143        nan 0.85238095 0.94285714 0.94285714
 0.7952381  0.91428571 0.7952381  0.91428571 0.67142857 0.91428571]


# Test Case

In [3]:
def predict_new_data(new_data, vectorizer, model):
    """Predicts labels for new data using the trained model.

    Args:
        new_data (list): A list of text strings representing the new data.
        vectorizer (TfidfVectorizer): The trained TF-IDF vectorizer.
        model (KNeighborsClassifier): The trained KNN model.

    Returns:
        list: The predicted labels for the new data.
    """
    # 1. Transform new data using the trained vectorizer
    new_data_tfidf = vectorizer.transform(new_data)

    # 2. Make predictions using the trained model
    predictions = model.predict(new_data_tfidf)

    return predictions


# Example usage:
new_data = [
    "This is a new server request",
    "Need help with Power BI",
    "Database issue",
]

predicted_labels = predict_new_data(new_data, vectorizer, best_knn_classifier)

print("New data:", new_data)
print("Predicted labels:", predicted_labels)

New data: ['This is a new server request', 'Need help with Power BI', 'Database issue']
Predicted labels: ['IT' 'D&A' 'IT']
