#### Read Train Data

In [95]:
import pandas as pd
import json

# Read the JSON data from the file
with open('train_data.json', 'r') as json_file:
    data = json.load(json_file)

# Create a list to store tag and patterns
tag_patterns_list = []

# Extract "tag" and "patterns" from each JSON object
for item in data:
    tag = item["tag"]
    patterns = item["patterns"]
    tag_patterns_list.append({"patterns": patterns[0] if patterns else "", "tag": tag})

# Create a DataFrame with "patterns" as the first column and "tag" as the second column
df = pd.DataFrame(tag_patterns_list)

# Display the DataFrame
df['patterns'][26]


"how can i initiate a return for a product that i purchased if i'm not satisfied"

In [96]:
# product_availability, return_initiation

#### Read Test Data

In [97]:
import pandas as pd

# Initialize an empty list to store tag and patterns
tag_patterns_list = []

# Read data from the test_data.txt file
with open('test_data.txt', 'r') as file:
    for line in file:
        line = line.strip()  # Remove leading/trailing whitespace
        tag, patterns = line.split(',', 1)  # Split into tag and patterns
        tag = tag.strip('"')  # Remove double quotes around tag
        tag_patterns_list.append({"patterns": patterns.strip('"'), "tag": tag})  # Swap tag and patterns

# Create a DataFrame for the test data with the desired order
test_df = pd.DataFrame(tag_patterns_list)

# Display the modified DataFrame for the test data
test_df

Unnamed: 0,patterns,tag
0,Is there a warranty for this product?,product_warranty
1,What is the return policy if I'm not satisfied...,product_return
2,Do you provide free shipping?,product_free_shipping
3,Are there any discount codes available?,product_discount
4,Are payments via Google Wallet accepted?,payment_google
...,...,...
62,When can I expect my order to arrive?,delivery_date
63,What is the estimated delivery date?,delivery_date
64,What are the features of the latest model of {...,last_model_features
65,Can you provide a list of the best-selling pro...,product_list


## Establish Random Forest classifier

In [104]:
import nltk
import string
import random
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier

# Define a function to preprocess text into bigrams
def preprocess_text(text):
    # Tokenize data
    tokens = nltk.word_tokenize(text)

    # Lowercase all words
    tokens = [word.lower() for word in tokens]

    # Remove stopwords and punctuation
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    # Generate bigrams
    bigrams_list = list(ngrams(tokens, 1))  # '1' n-gram size

    # Lemmatize words (optional)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [' '.join(bigram) for bigram in bigrams_list]

    return tokens

# Define a function to extract POS tags from text
def extract_pos_tags(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    return [tag for _, tag in pos_tags]

# Define a function to preprocess a single text string and convert it to a feature dictionary
def preprocess_text_to_features(text, n=1):
    preprocessed_text = preprocess_text(text)
    feature_dict = {' '.join(bigram): True for bigram in preprocessed_text}
    
    # Extract POS tags and add them as features
    pos_tags = extract_pos_tags(text)
    for tag in pos_tags:
        feature_dict[tag] = True
    
    return feature_dict

# Prepare the data as labeled featuresets from the 'df' DataFrame
labeled_featuresets = []
for index, row in df.iterrows():
    tag = row['tag']
    pattern = row['patterns']
    feature_dict = preprocess_text_to_features(pattern)
    labeled_featuresets.append((feature_dict, tag))

# Shuffle the labeled featuresets to ensure randomness
random.shuffle(labeled_featuresets)

# Convert feature dictionaries to a feature matrix
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform([x for x, _ in labeled_featuresets])
y_train = [y for _, y in labeled_featuresets]

random.seed(42)

# Train a Random Forest classifier
classifier = RandomForestClassifier(n_estimators = 100, random_state = 42, min_samples_split = 4 )
classifier.fit(X_train, y_train)

# Define a function to get a tag for a user's question using the Random Forest classifier
def get_tag(question, classifier, df):
    # Preprocess the question with bigrams and convert it to a feature dictionary
    feature_dict = preprocess_text_to_features(question)
    
    # Extract POS tags and add them as features
    pos_tags = extract_pos_tags(question)
    for tag in pos_tags:
        feature_dict[tag] = True
    
    # Convert the feature dictionary to a feature vector
    feature_vector = vectorizer.transform([feature_dict])
    
    # Use the classifier to predict the tag
    predicted_tag = classifier.predict(feature_vector)[0]
    
    return predicted_tag

# Example usage
x = "The images on your page aren't loading properly."
predicted_tag = get_tag(x, classifier, df)
print("Predicted Tag:", predicted_tag)


Predicted Tag: image_loading_troubleshooting


## Finding Accuracy

In [105]:
# Prepare the test data as labeled featuresets from the 'test_df' DataFrame
test_labeled_featuresets = []
for index, row in test_df.iterrows():
    tag = row['tag']
    pattern = row['patterns']
    feature_dict = preprocess_text_to_features(pattern)
    test_labeled_featuresets.append((feature_dict, tag))

# Convert feature dictionaries to a feature matrix for the test data
X_test = vectorizer.transform([x for x, _ in test_labeled_featuresets])
y_test = [y for _, y in test_labeled_featuresets]

# Use the trained classifier to make predictions on the test data
y_pred = classifier.predict(X_test)

# Calculate accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100} %")


Accuracy: 89.55223880597015 %


## Grid search on hypyerparameters 


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut


# Define the hyperparameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2,3,4,5,10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
}

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Create LOOCV object
loo_cv = LeaveOneOut()

# Perform grid search with LOOCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=loo_cv, scoring='accuracy')

# Perform the grid search on your training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found by the grid search
print("Best Hyperparameters:", grid_search.best_params_)


Best Hyperparameters: 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100

## Example of the next working with Data base

In [107]:
import spacy
import sqlite3

# Load the spaCy model with NER
nlp = spacy.load("en_core_web_sm")

# Simliar To database:
# Function to retrieve product information from the database
def get_product_info(product_name):
    # Your database retrieval code here (as shown in the previous response)
    product_names = ["iPhone 11", "Samsung Galaxy S21", "Google Pixel 5"]
    available_colors = ["Red, Green, Yellow", "Pink, White, Gray", "Black, White, Green"]

    for index in range(len(product_names)):
        if product_names[index] == product_name:
            return {"available_colors":available_colors[index]}
    pass 

# Function to extract product names from user input using spaCy NER
def extract_product_name(user_input):
    doc = nlp(user_input)
    
    # Initialize a list to store extracted product names
    product_names = []
    
    # Extract entities recognized as ORGANIZATION (which can represent product names)
    for ent in doc.ents:
        # another maybe used in another situations : if ent.label_ in ["PERSON", "ORG", "LOC", "DATE", "TIME", "MONEY"]
        if ent.label_ == "ORG":
            product_names.append(ent.text)
    return product_names

# User input
user_input = "What are the available colors of iPhone 11 and Samsung Galaxy S21?"

predicted_tag = get_tag(user_input, classifier, df)

if predicted_tag == "product_availability":
    # Extract product names from user input
    product_names = extract_product_name(user_input)

    # Initialize a dictionary to store product information
    product_info_dict = {}

    # Retrieve product information from the database for each product name
    for product_name in product_names:
        product_info = get_product_info(product_name)
        if product_info:
            product_info_dict[product_name] = product_info

    # Generate responses based on the retrieved product information
    responses = []
    for product_name, product_info in product_info_dict.items():
        if 'available_colors' in product_info:
            response = f"The available colors for {product_name} are {product_info['available_colors']}."
        else:
            response = f"Information about colors for {product_name} is not available."
        responses.append(response)

    if responses:
        final_response = "\n".join(responses)
    else:
        final_response = "Sorry, I couldn't find information about the specified products."

    # Print the response
    print("User Input:", user_input)
    print("Response:", final_response)


User Input: What are the available colors of iPhone 11 and Samsung Galaxy S21?
Response: The available colors for iPhone 11 are Red, Green, Yellow.
The available colors for Samsung Galaxy S21 are Pink, White, Gray.
