# In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Read the train and test CSVs with identical parser settings: lines pandas
# cannot parse are skipped and the files are decoded as latin-1.
read_options = {'on_bad_lines': 'skip', 'encoding': 'latin1'}
train_data = pd.read_csv('/content/Doceree-HCP_Train.csv', **read_options)
test_data = pd.read_csv('/content/Doceree-HCP_Test.csv', **read_options)

# Remove the 'TAXONOMY' column from the train data
train_data = train_data.drop(columns='TAXONOMY')

# Report the total number of missing cells across the whole train frame
missing_in_train = train_data.isna().sum().sum()
print('Missing values in train data:', missing_in_train)

# Keep only the train rows that have a value in every column
train_data = train_data.dropna(axis=0, how='any')

# Separate the target from the features; the user id and the label itself
# are excluded from the feature matrix.
y_train = train_data[['IS_HCP']]
X_train = train_data.drop(columns=['USERPLATFORMUID', 'IS_HCP'])

# Report how many feature cells are missing before imputation
missing_before = X_train.isna().sum().sum()
print('Missing values in X_train before imputation:', missing_before)

# Fit a most-frequent-value imputer on the features and rebuild the frame
# from the returned array, keeping the original column names.
imputer = SimpleImputer(strategy='most_frequent')
imputed_values = imputer.fit_transform(X_train)
X_train = pd.DataFrame(imputed_values, columns=X_train.columns)

# Report how many feature cells are missing after imputation
missing_after = X_train.isna().sum().sum()
print('Missing values in X_train after imputation:', missing_after)

# One-hot encode every feature column; categories not seen during fit are
# ignored when the encoder is applied later.
ohe = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = ohe.fit_transform(X_train)

# Fit a 100-tree random forest with a fixed seed on the encoded features;
# the single-column target frame is flattened to a 1-D array.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
target_values = y_train.to_numpy().ravel()
clf.fit(X_train_encoded, target_values)

# Report the total number of missing cells in the test frame
missing_in_test = test_data.isna().sum().sum()
print('Missing values in test data:', missing_in_test)

# Build the test feature matrix (everything except the user id) and fill
# missing values with the imputer that was fitted on the train features.
X_test = test_data.drop(columns=['USERPLATFORMUID'])
imputed_test_values = imputer.transform(X_test)
X_test = pd.DataFrame(imputed_test_values, columns=X_train.columns)

# Apply the already-fitted one-hot encoder to the test features
X_test_encoded = ohe.transform(X_test)

# Predict the label for every test row
y_pred = clf.predict(X_test_encoded)

# Create a dataframe with the predictions and user ids
output_df = pd.DataFrame({'USERPLATFORMUID': test_data['USERPLATFORMUID'], 'IS_HCP': y_pred})

# Fill 'TAXONOMY' with the first 'KEYWORDS' value found in the train data for
# the same user id, but only on rows predicted as IS_HCP == 1; every other row
# (not predicted as 1, or no matching user id) gets an empty string.
#
# The column is assigned in one step rather than cell by cell through
# `output_df['TAXONOMY'].iloc[i] = ...`: that chained assignment raises
# SettingWithCopyWarning and does not update output_df at all when pandas
# Copy-on-Write is active. A single lookup table also avoids re-filtering
# train_data once per predicted row.
#
# NOTE(review): keywords are looked up in train_data, not test_data, so only
# test users that also occur in the train data receive a value -- confirm
# this is the intended source.
first_keywords = (
    train_data.drop_duplicates(subset='USERPLATFORMUID', keep='first')
    .set_index('USERPLATFORMUID')['KEYWORDS']
)
matched_keywords = output_df['USERPLATFORMUID'].map(first_keywords)
is_predicted_hcp = output_df['IS_HCP'] == 1
output_df['TAXONOMY'] = matched_keywords.where(is_predicted_hcp).fillna("")

# Save the output to a CSV file
output_df.to_csv('output.csv', index=False)

# Read the output CSV file back and display it
output_data = pd.read_csv('output.csv')
print(output_data)