In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.impute import SimpleImputer

In [2]:
def encode_text_column(df, text_column, vectorizer): 
    """
    Encodes a text column with the given vectorizer, drops the old column (with text),
    and returns the dataframe with the encoded text.

    Args:
        df (pd.DataFrame): The dataframe.
        text_column (str): The name of the text column to be encoded.
        vectorizer: The text vectorizer.

    Returns:
        pd.DataFrame: The dataframe with the encoded text.
    """
    # Replace NaN values with an empty string
    df[text_column] = df[text_column].fillna('')

    # Vectorize the text column
    df_vectorized = vectorizer.fit_transform(df[text_column])
    
    # Create a dataframe from the vectorized data
    df_encoded = pd.DataFrame(df_vectorized.toarray(), columns=[f"{text_column}_{i}" for i in range(df_vectorized.shape[1])])

    # Drop the original text column
    df.drop([text_column], axis=1, inplace=True)

    # Concatenate the original dataframe with the encoded text dataframe
    df = pd.concat([df, df_encoded], axis=1)

    # Ensure all column names are strings
    df.columns = df.columns.astype(str)

    return df


In [3]:
df = pd.read_csv('data/modelready_220423.csv')

In [4]:
def preprocess_and_encode(df):
    # Extract unique countries in the df
    unique_values = set()
    df['countries_in_family'].apply(lambda x: unique_values.update(x.strip("[]").replace("'", "").split())) 

    # Create new columns for each unique value
    for value in unique_values:
        # each country has a column (1 if the patent belongs to the country 0 otherwise)
        df[value] = df['countries_in_family'].apply(lambda x: 1 if value in x else 0)

    # Drop samples without abstract
    df = df[df.abstract.notna()].copy()

    # Encode company names
    df['company_name_encoded'] = df.company_name.astype('category').cat.codes  

    # Remove non-numeric columns
    df_columns_dropped = df.drop(['publication_number', 'company_name', 'countries_in_family', 'publn_nr','primary_cpc'], axis=1)

    # Drop unnecessary columns
    df_columns_dropped = df_columns_dropped.drop(['f0_', 'centrality', 'similarity'], axis=1)

    # Drop text columns temporarily
    text = df_columns_dropped[['abstract', 'description_text']]
    df_columns_dropped.drop(['abstract', 'description_text'], axis=1, inplace=True)

    # Drop columns where all values are the same
    min_eq_max = df_columns_dropped.columns[df_columns_dropped.min() == df_columns_dropped.max()].to_list()
    df_clean = df_columns_dropped.drop(min_eq_max, axis=1)


    # Impute NaN values
    imputer = SimpleImputer(strategy='mean')
    df_no_missing = pd.DataFrame(imputer.fit_transform(df_clean), columns=df_clean.columns)
    
    # Extract target variable
    y = df_no_missing['commercialized']
    df_no_missing.drop('commercialized', axis=1, inplace=True)


    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(df_clean, y, test_size=0.20, random_state=42)

    # Rescale
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Putting text back in
    df_clean[['abstract', 'description_text']] = text  

    # Split again after putting text back in
    X_train, X_test, y_train, y_test = train_test_split(df_clean, y, test_size=0.20, random_state=42)

    # Same vectorizer applied to training and testing for abstract
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_ab = encode_text_column(X_train, 'abstract', vectorizer)
    X_test_ab = encode_text_column(X_test, 'abstract', vectorizer)

    # Same vectorizer applied to training and testing for description_text
    X_train_de = encode_text_column(X_train_ab, 'description_text', vectorizer)
    X_test_de = encode_text_column(X_test_ab, 'description_text', vectorizer)

    return X_train_de, X_test_de, y_train, y_test

In [5]:
def train_naive_bayes(X_train, X_test, y_train, y_test):
    # Train a Multinomial Naive Bayes classifier
    naive_bayes_model = MultinomialNB()
    naive_bayes_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = naive_bayes_model.predict(X_test)

    # Evaluate the performance
    accuracy = accuracy_score(y_test, y_pred)
    classification_report_str = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", classification_report_str)

    return naive_bayes_model

In [6]:
# Apply the preprocessing and encoding function
X_train_de, X_test_de, y_train, y_test = preprocess_and_encode(df)

# Train and evaluate the Naive Bayes model
naive_bayes_model = train_naive_bayes(X_train_de, X_test_de, y_train, y_test)

ValueError: Input X contains NaN.
MultinomialNB does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values