# After understanding that MultinomialNB takes array-like matrices as input

In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.impute import SimpleImputer

from utilities import *

In [17]:
def _parse_countries(raw):
    """Split the string form of a country list into a set of country codes.

    Brackets, quotes and commas are all treated as separators, so both
    ``"['US' 'EP']"`` and ``"['US', 'EP']"`` yield ``{'US', 'EP'}``.
    Non-string values (e.g. NaN) yield an empty set.
    """
    if not isinstance(raw, str):
        return frozenset()
    return frozenset(raw.strip("[]").replace("'", "").replace(",", " ").split())


def preprocess(df, test_size=0.20, random_state=42):
    """Turn the raw patent frame into numeric train/test feature splits.

    Steps:
      1. keep only the rows that have an abstract;
      2. add one 0/1 indicator column per country code found in
         ``countries_in_family``;
      3. integer-encode ``company_name`` as ``company_name_encoded``;
      4. drop identifier, text and excluded columns;
      5. fill missing values with the column mean;
      6. separate the ``commercialized`` target and drop constant features;
      7. split into train and test sets.

    The input frame is not modified. The returned features are NOT rescaled;
    scaling is left to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain the columns referenced below
        (``abstract``, ``countries_in_family``, ``company_name``,
        ``commercialized``, and every column listed for removal).
    test_size : float, default 0.20
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.
    """
    # Filter first and work on a copy, so the caller's frame is left untouched.
    # Countries that only occur in the discarded rows would produce all-zero
    # columns, which the constant-column filter below removes anyway.
    df = df[df.abstract.notna()].copy()

    # One indicator column per country (1 if the patent lists the country).
    # Codes are sorted so the column order is identical on every run; plain
    # set iteration order of strings changes between interpreter sessions.
    country_sets = df['countries_in_family'].map(_parse_countries)
    countries = sorted({code for codes in country_sets for code in codes})
    indicators = pd.DataFrame(
        {code: [int(code in codes) for codes in country_sets] for code in countries},
        index=df.index,
    )
    # Attach all indicators at once instead of inserting columns one by one.
    df = pd.concat([df, indicators], axis=1)

    # Encode company names as integer category codes (missing names become -1).
    df['company_name_encoded'] = df.company_name.astype('category').cat.codes

    columns_to_drop = [
        # identifiers and raw categorical columns
        'publication_number', 'company_name', 'countries_in_family', 'publn_nr', 'primary_cpc',
        # f0_ duplicates the target; the other two must not be used as features
        'f0_', 'centrality', 'similarity',
        # free text is not used by this numeric pipeline
        'abstract', 'description_text',
    ]
    numeric = df.drop(columns=columns_to_drop)

    # NOTE(review): the means are computed over all rows, i.e. before the
    # train/test split, so test rows influence the imputed values.
    numeric = numeric.fillna(numeric.mean())

    # Extract the prediction target.
    y = numeric.pop('commercialized')

    # Drop columns where all the values are the same (min == max).
    constant_columns = numeric.columns[numeric.min() == numeric.max()].to_list()
    features = numeric.drop(columns=constant_columns)

    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


In [18]:
# Load the model-ready dataset from CSV; `df` is the frame handed to preprocess() below.
df = pd.read_csv('data/modelready_220423.csv')

In [19]:
from sklearn.preprocessing import MinMaxScaler

# Run the preprocessing pipeline on the loaded frame: preprocess() returns the
# train/test feature frames together with their matching targets.
X_train, X_test, y_train, y_test = preprocess(df)


In [20]:
# Sanity check: after imputation the training features should hold no missing values.
print(f"There aren't any NaN values in X_train_de:\n>> {X_train.notna().all().all()}")

There aren't any NaN values in X_train_de:
>> True


In [21]:
# Convert the feature DataFrames into plain NumPy arrays for scaling and modelling.
X_train_array, X_test_array = X_train.to_numpy(), X_test.to_numpy()

In [22]:
# Rescale every feature to the [0, 1] range using statistics from the training
# split only. The scaler is fitted on the same NumPy array it transforms:
# fitting on the DataFrame and then transforming arrays makes scikit-learn warn
# that "X does not have valid feature names". Fitting on the array also keeps
# the cell safe to re-run, because refitting on already-scaled data leaves it
# unchanged.
scaler = MinMaxScaler()
X_train_array = scaler.fit_transform(X_train_array)
X_test_array = scaler.transform(X_test_array)



In [23]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB

def train_naive_bayes_model(X_train, y_train):
    """Fit a ``MultinomialNB`` with default settings and return the fitted model.

    Parameters
    ----------
    X_train : array-like
        Training features, passed straight to ``MultinomialNB.fit``.
    y_train : array-like
        Training targets, passed straight to ``MultinomialNB.fit``.
    """
    # ``fit`` hands back the estimator itself, so it can be returned directly.
    return MultinomialNB().fit(X_train, y_train)

def evaluate_model(nb_classifier, X_test, y_test):
    """Print and return the accuracy of a fitted classifier on held-out data.

    Parameters
    ----------
    nb_classifier : object
        Fitted model exposing ``predict(X)``.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True labels, in the same row order as ``X_test``.

    Returns
    -------
    float
        Fraction of predictions equal to the true label. Earlier versions
        only printed this value; it is now also returned so callers can
        store or compare it.
    """
    # Make predictions using the trained model
    y_pred = nb_classifier.predict(X_test)

    # Compare as plain arrays so labels are matched by position, regardless of
    # any pandas index carried by y_test.
    accuracy = float(np.mean(np.asarray(y_pred) == np.asarray(y_test)))
    print("Accuracy:", accuracy)
    return accuracy

# From here on, X_train / X_test refer to the scaled NumPy arrays.
X_train, X_test = X_train_array, X_test_array

# Fit the Naive Bayes classifier on the training split ...
nb_classifier = train_naive_bayes_model(X_train, y_train)

# ... and report its accuracy on the held-out split.
evaluate_model(nb_classifier, X_test, y_test)


Accuracy: 0.8186311077955987
