# Part 1
(This is the working notebook)

In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.impute import SimpleImputer

from utilities import *

In [36]:
def preprocess(df):
    # Extract unique countries in the df
    unique_values = set()
    df['countries_in_family'].apply(lambda x: unique_values.update(x.strip("[]").replace("'", "").split())) 

    # Create new columns for each unique value
    for value in unique_values:
        # each country has a column (1 if the patent belongs to the country 0 otherwise)
        df[value] = df['countries_in_family'].apply(lambda x: 1 if value in x else 0)

    df = df[df.abstract.notna()].copy()  # drop all samples without abstract

    # Encode company names
    df['company_name_encoded'] = df.company_name.astype('category').cat.codes  # encode companies

    # Remove non-numeric columns
    df_columns_dropped = df.drop(['publication_number', 'company_name', 'countries_in_family', 'publn_nr','primary_cpc'], axis=1)

    # f0_ has the same value as commercialization, the other two shouldn't be used
    df_columns_dropped = df_columns_dropped.drop(['f0_', 'centrality', 'similarity'], axis=1)

    # Remove text as I can't compute min and max on it
    text = df_columns_dropped[['abstract', 'description_text']]  # putting them aside for later
    df_columns_dropped.drop(['abstract', 'description_text'], axis=1, inplace=True)

    df_no_missing = df_columns_dropped.fillna(df_columns_dropped.mean()).copy()

    # Extracting what we'll try to predict
    y = df_no_missing['commercialized']
    df_no_missing.drop('commercialized', axis=1, inplace=True)

    # Dropping columns where all the values are the same (min = max)
    min_eq_max = df_no_missing.columns[df_no_missing.min() == df_no_missing.max()].to_list()
    df_clean = df_no_missing.drop(min_eq_max, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(df_clean, y, test_size=0.20, random_state=42)

    # Rescale
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # # Putting text back in
    # df_clean[['abstract', 'description_text']] = text 

    X_train, X_test, y_train, y_test = train_test_split(df_clean, y, test_size=0.20, random_state=42) 

    # # Same vectorizer applied to training and testing for abstract
    # vectorizer = TfidfVectorizer(max_features=1000)  # Adjust 'max_features' as needed
    # X_train_ab = encode_text_colum(X_train, 'abstract', vectorizer)
    # X_test_ab = encode_text_colum(X_test, 'abstract', vectorizer)

    # # Same vectorizer applied to training and testing for description_text
    # vectorizer = TfidfVectorizer(max_features=1000)  # Adjust 'max_features' as needed
    # X_train_de = encode_text_colum(X_train_ab, 'description_text', vectorizer)
    # X_test_de = encode_text_colum(X_test_ab, 'description_text', vectorizer)

    # return X_train_de, X_test_de, y_train, y_test
    return df_clean, y, X_train, X_test, y_train, y_test


In [3]:
df = pd.read_csv('data/modelready_220423.csv')

In [37]:
from sklearn.preprocessing import MinMaxScaler

# Apply the preprocessing and encoding function
df_clean, y, X_train, X_test, y_train, y_test = preprocess(df)

In [38]:
print(f"There aren't any NaN values in X_train_de:\n>> {X_train.isna().sum().sum() == 0}")

There aren't any NaN values in X_train_de:
>> True


In [39]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB

def train_naive_bayes_model(X_train, y_train):
    # Create and fit the Naive Bayes model
    nb_classifier = MultinomialNB()
    nb_classifier.fit(X_train, y_train)

    return nb_classifier

def evaluate_model(nb_classifier, X_test, y_test):
    # Make predictions using the trained model
    y_pred = nb_classifier.predict(X_test)

    # Calculate the accuracy
    accuracy = np.mean(y_pred == y_test)
    print("Accuracy:", accuracy)

# Rescaling
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_array = scaler.transform(X_train.values)
X_test_array = scaler.transform(X_test.values)

# Load the preprocessed data
X_train = X_train_array
X_test = X_test_array

# Train the Naive Bayes model
nb_classifier = train_naive_bayes_model(X_train, y_train)

# Evaluate the model performance
evaluate_model(nb_classifier, X_test, y_test)


Accuracy: 0.8186311077955987




# Part 2

In [40]:
def modify_df(df, cols_to_drop):
    df_out = df.copy()
    df_out = df_out.drop(cols_to_drop, axis=1)
    return df_out

# def modify_df(arr, cols_to_drop):
#     arr_out = np.delete(arr, np.where(np.isin(arr.columns, cols_to_drop)), axis=1)
#     return arr_out


In [41]:
any_nan = df_clean.isna().any().any()
print("Any NaN values in the DataFrame:", any_nan)

Any NaN values in the DataFrame: False


In [42]:
df_clean['backward_citations_app']

0        140
1        203
2         69
3        162
4        208
        ... 
63342     75
63343     15
63346     64
63347      1
63348     50
Name: backward_citations_app, Length: 53616, dtype: int64

In [43]:
to_drop = ['backward_citations_app', 'backward_citations_exa'] 
modify_df(df_clean, to_drop).shape

(53616, 842)

In [48]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Load preprocessed data
df_clean, y, X_train, X_test, y_train, y_test = preprocess(df)

# Function to train Naive Bayes model
def train_naive_bayes_model(X_train, y_train):
    nb_classifier = MultinomialNB()
    nb_classifier.fit(X_train, y_train)
    return nb_classifier

# Function to evaluate Naive Bayes model
def evaluate_model(nb_classifier, X_test, y_test):
    y_pred = nb_classifier.predict(X_test)
    accuracy = np.mean(y_pred == y_test)
    print("Accuracy:", accuracy)

# # Function to modify the dataframe by dropping specified columns
# def modify_df(df, cols_to_drop):
#     df_out = df.copy()
#     df_out = df_out.drop(cols_to_drop, axis=1)
#     return df_out

# Function to modify the dataframe by dropping specified columns
def modify_df(df, cols_to_drop):
    df_out = df.copy()
    df_out = df_out.drop(cols_to_drop, axis=1, errors='ignore')  # Add errors='ignore' to handle non-existing columns
    return df_out


# Iterate Over Feature Groups and Train Naive Bayes
def group_features(df_clean, threshold=0.8):
    # Compute the correlation matrix
    corr_matrix = df_clean.corr().abs()

    # Extract the upper triangle of the correlation matrix
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find features with correlation above the threshold
    correlated_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]

    # Create a list of groups of features with at least two variables
    grouped_features = []
    for feature in correlated_features:
        # Check if the feature is already in a group
        added = False
        for group in grouped_features:
            if feature in group:
                added = True
                break

        # If the feature is not in any group, create a new group
        if not added:
            new_group = [feature]
            # Add correlated features to the group
            for correlated_feature in correlated_features:
                if correlated_feature != feature and corr_matrix.loc[feature, correlated_feature] > threshold:
                    new_group.append(correlated_feature)
            # Only add the group if it has at least two variables
            if len(new_group) > 1:
                grouped_features.append(new_group)

    return grouped_features

# Function to iterate over feature groups, modify dataframe, and print outcome
def iterate_feature_groups(X_train, X_test, y_train, y_test, feature_groups):
    for group in feature_groups:
        print(f"Performance after removing features {group}:")

        # Modify the dataframes by dropping the current group of features
        X_train_mod = modify_df(pd.DataFrame(X_train), group)
        X_test_mod = modify_df(pd.DataFrame(X_test), group)

        # Rescaling
        scaler = MinMaxScaler()
        scaler.fit(X_train)
        X_train_array_mod = scaler.transform(X_train_mod.values)
        X_test_array_mod = scaler.transform(X_test_mod.values)

        # Load the preprocessed data
        X_train_mod = X_train_array_mod
        X_test_mod = X_test_array_mod

        # Train Naive Bayes model on modified data
        nb_classifier = train_naive_bayes_model(X_train_mod, y_train)

        # Evaluate the model on the test set
        evaluate_model(nb_classifier, X_test_mod, y_test)


### Test n.1
threshold = 0.8

(threshold indicates how correlated the features must be in order to group them together)

In [49]:
# Load preprocessed data and define feature groups
feature_groups = group_features(df_clean, threshold=0.8)

# Usage:
iterate_feature_groups(X_train_array, X_test_array, y_train, y_test, feature_groups)


Performance after removing features ['dummy_country_ZW', 'MD', 'ZW']:
Accuracy: 0.8186311077955987


### Test n.2
threshold = 0.5

(threshold indicates how correlated the features must be in order to group them together)

In [50]:
# Load preprocessed data and define feature groups
feature_groups = group_features(df_clean, threshold=0.5)

# Usage:
iterate_feature_groups(X_train_array, X_test_array, y_train, y_test, feature_groups)


Performance after removing features ['geog_family_size_x', 'geog_family_size_y', 'dummy_country_AU', 'dummy_country_BR', 'dummy_country_CA', 'dummy_country_CN', 'dummy_country_DK', 'dummy_country_EP', 'dummy_country_ES', 'dummy_country_IL', 'dummy_country_JP', 'dummy_country_KR', 'dummy_country_MX', 'dummy_country_PL', 'dummy_country_PT', 'MX', 'KR', 'PL', 'BR', 'EP', 'CN', 'IL', 'CA', 'JP', 'AU', 'PT', 'DK', 'ES']:
Accuracy: 0.8186311077955987
Performance after removing features ['dummy_country_CR', 'geog_family_size_y', 'dummy_country_EC', 'dummy_country_PE', 'CR', 'PE', 'EC']:
Accuracy: 0.8186311077955987
Performance after removing features ['dummy_country_CY', 'geog_family_size_y', 'dummy_country_SI', 'CY', 'SI']:
Accuracy: 0.8186311077955987
Performance after removing features ['dummy_country_CZ', 'dummy_country_SK', 'SK', 'CZ', 'BG']:
Accuracy: 0.8186311077955987
Performance after removing features ['dummy_country_DE', 'AT', 'DE']:
Accuracy: 0.8186311077955987
Performance after r

In [47]:
df_clean[['geog_family_size_y']]


Unnamed: 0,geog_family_size_y
0,16
1,1
2,1
3,64
4,16
...,...
63342,16
63343,4
63346,1
63347,4
