In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# Load data
# Features and labels are paired by row position further down (they are handed
# to train_test_split together), so exactly as many label rows are read as
# there are feature rows instead of relying on a hard-coded row cap.
train_data = pd.read_csv('train.csv')
train_labels = pd.read_csv('trainLabels.csv', nrows=len(train_data))

# Drop id column from the features; it is an identifier, not a predictor.
train_data = train_data.drop(columns=['id'])

# Separate features
# Every column is assigned to exactly one of three groups, and its missing
# values are filled with a group-specific default so that no NaN remains.
HASH_LENGTH = 44  # columns whose values are 44-character strings are treated as hashed text

boolean_features = []
numerical_features = []
hash_features = []

for column in train_data.columns:
    values = train_data[column]
    observed = values.dropna()
    # Classify from the first *non-missing* value: a NaN in row 0 would
    # stringify to 'nan' and push a hash column into the numeric group.
    first_value = observed.iloc[0] if not observed.empty else None

    if isinstance(first_value, str) and len(first_value) == HASH_LENGTH:
        hash_features.append(column)
        # Sentinel token so the text vectorizer still receives a string.
        train_data[column] = values.fillna('NULL1')

    elif {'YES', 'NO'} <= set(observed.unique()):
        boolean_features.append(column)
        # Assign the filled column back rather than calling
        # fillna(..., inplace=True) on the selected column: under pandas
        # copy-on-write that acts on a temporary and leaves the frame unchanged.
        train_data[column] = values.fillna(values.mode()[0])

    else:
        numerical_features.append(column)
        train_data[column] = values.fillna(0.0)

# Prepare data: the targets are every label column except the row identifier;
# the features are the already-cleaned training frame.
y = train_labels.drop('id', axis=1)
X = train_data

# Train-validation split: hold out 20% of the rows, with a fixed seed so the
# partition is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing pipeline
# The complete transformer list is assembled first and handed to
# ColumnTransformer in one go, instead of appending to
# `preprocessor.transformers` after the object has been constructed.

# One character-level TF-IDF vectorizer per hash column. The column is given
# as a bare name (not a list) so the vectorizer receives a 1-D sequence of
# strings rather than a 2-D frame.
hash_transformers = [
    (column, TfidfVectorizer(analyzer='char', lowercase=False), column)
    for column in hash_features
]

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' keeps predict() from failing on a category
        # that did not occur in the training rows.
        # Copies of the feature lists are passed so that later changes to
        # those lists cannot alter this transformer's column specification.
        ('bool', OneHotEncoder(handle_unknown='ignore'), list(boolean_features)),
        ('num', StandardScaler(), list(numerical_features)),
    ] + hash_transformers,
    remainder='passthrough'
)

# Model pipeline
# random_state is fixed on the forest so that, like the seeded
# train/validation split, training gives the same result on every run.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_val)
# zero_division=1 silences the undefined-metric warning by reporting 1.00
# wherever precision/recall would divide by zero (e.g. labels with no
# validation samples), so those rows of the report are not real scores.
print(classification_report(y_val, y_pred, zero_division=1))


              precision    recall  f1-score   support

           0       1.00      0.18      0.31        11
           1       1.00      0.00      0.00         1
           2       0.95      0.88      0.92        43
           3       1.00      0.74      0.85        34
           4       1.00      1.00      1.00         0
           5       0.87      0.69      0.77       151
           6       1.00      0.17      0.29        72
           7       1.00      1.00      1.00         0
           8       0.97      0.52      0.68       139
           9       1.00      0.26      0.42        34
          10       1.00      0.00      0.00         3
          11       0.79      0.63      0.70       142
          12       1.00      0.45      0.62        20
          13       1.00      1.00      1.00         0
          14       1.00      0.50      0.67         6
          15       1.00      0.57      0.73        21
          16       1.00      1.00      1.00         0
          17       1.00    

In [2]:
test_data = pd.read_csv('test.csv')
# Drop the first column and adopt the training column names so the fitted
# pipeline can select columns by name.
test_data = test_data.drop(test_data.columns[0], axis=1)
test_data.columns = train_data.columns

# Fill missing values using the column groups that were derived from the
# training data. The groups are only read here, never re-detected or appended
# to, so the test frame is treated exactly like the frame the model was fitted
# on and this cell can be re-run without growing the feature lists.
for column in hash_features:
    test_data[column] = test_data[column].fillna('NULL1')

for column in boolean_features:
    # Impute with the most frequent *training* value: consistent with how the
    # training column was filled, and well-defined even if the test column is
    # entirely missing.
    most_frequent_value = train_data[column].mode()[0]
    test_data[column] = test_data[column].fillna(most_frequent_value)

for column in numerical_features:
    test_data[column] = test_data[column].fillna(0.0)



In [3]:
test_predictions = model.predict(test_data)

# Names of the target columns. They are derived without modifying
# train_labels (no in-place drop of 'id'), so this cell can be re-run and the
# labels frame stays intact for any other cell that uses it.
label_columns = [name for name in train_labels.columns if name != 'id']

# Create submission DataFrame: one column per target, one row per test row.
submission = pd.DataFrame(test_predictions, columns=label_columns)

# NOTE(review): the ids are synthesised as "1_y", "2_y", ... from the row
# position and are not taken from test.csv -- confirm this matches the
# expected submission format.
id_labels = [f"{i}_y" for i in range(1, len(submission) + 1)]
submission.insert(0, 'id_label', id_labels)  # id_label goes first, targets follow
submission.to_csv('sampleSubmission1.csv', index=False)


In [4]:
import pandas as pd

# Load the submission written earlier.
df = pd.read_csv('sampleSubmission1.csv')

# Row identifiers "1_y", "2_y", ... -- one per row of the submission.
row_ids = [f"{i}_y" for i in range(1, len(df) + 1)]

# Take an independent copy of the final two columns and label them y32 / y33.
last_two_cols = df.iloc[:, -2:].copy()
last_two_cols.columns = ['y32', 'y33']

# Put the identifier in front, giving the column order id_label, y32, y33.
last_two_cols.insert(0, 'id_label', row_ids)

# Save the three columns to an Excel workbook without the index.
last_two_cols.to_excel('output1.xlsx', index=False)
