In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the raw click log from the working directory.
data = pd.read_csv('train_sample.csv')

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# Display the summary of the dataset.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print("\nSummary of the dataset:")
data.info()

# Replace placeholders with NaN *before* counting missing values, so that
# '-' and '?' entries are included in the report below.
data = data.replace(['-', '?'], np.nan)

# Check for missing values (placeholders are already normalised to NaN here)
print("\nMissing values in the dataset:")
print(data.isnull().sum())

# Parse click_time once into a local Series; only the parts derived from it
# are kept on the frame.
click_ts = pd.to_datetime(data['click_time'])

# Extracting features from click_time
data['hour'] = click_ts.dt.hour
data['day'] = click_ts.dt.day
data['dayofweek'] = click_ts.dt.dayofweek

# Drop the raw timestamp columns. attributed_time is dropped without being
# parsed first: nothing is derived from it, so converting it was wasted work.
# NOTE(review): in the sample rows it is only populated where
# is_attributed == 1, so it should stay out of the features - confirm
# against the full data.
data = data.drop(columns=['click_time', 'attributed_time'])

# Display the first few rows after feature engineering
print("\nFirst few rows after feature engineering:")
print(data.head())

# Define the features and target
X = data.drop(columns=['is_attributed'])
y = data['is_attributed']

# Identify numerical and categorical features.
# 'number' matches every numeric width, not just int64/float64. This matters
# because pandas >= 2.0 returns int32 from the .dt accessors, and any column
# listed in neither group is dropped by ColumnTransformer (its default
# remainder is 'drop') without any warning.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical data: fill gaps with the column median, then
# standardise to zero mean / unit variance.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: fill gaps with the most frequent value,
# then one-hot encode; categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps so each column group gets its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Splitting the data into training and testing sets (80/20).
# stratify=y keeps the class proportions of the target the same in both
# parts, so the test metric is not skewed by an unlucky draw of the rarer
# class. train_test_split raises ValueError if a class has fewer than two
# members.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Check the shapes of the training and test sets
print("\nShapes of the training and test sets:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")

# Build the model pipeline: preprocessing and classifier are fitted together,
# so the imputer/scaler statistics are learned from the training rows only.
# n_jobs=-1 grows the trees on all available cores; with a fixed random_state
# the fitted forest is the same as with the single-threaded default.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Train the model
print("\nTraining the model...")
model.fit(X_train, y_train)

# Predict on the test set
print("\nPredicting on the test set...")
y_pred = model.predict(X_test)

# Evaluate the model.
# NOTE(review): accuracy is the only metric reported; if is_attributed is
# heavily imbalanced it will look high even for a trivial model - consider
# adding a ranking or per-class metric.
accuracy = accuracy_score(y_test, y_pred)
print(f'\nModel accuracy: {accuracy:.4f}')


First few rows of the dataset:
       ip  app  device  os  channel           click_time      attributed_time  \
0   89489    3       1  13      379  2017-11-06 15:13:23                  NaN   
1  204158   35       1  13       21  2017-11-06 15:41:07  2017-11-07 08:17:19   
2    3437    6       1  13      459  2017-11-06 15:42:32                  NaN   
3  167543    3       1  13      379  2017-11-06 15:56:17                  NaN   
4  147509    3       1  13      379  2017-11-06 15:57:01                  NaN   

   is_attributed  
0              0  
1              1  
2              0  
3              0  
4              0  

Summary of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2300561 entries, 0 to 2300560
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   ip               int64 
 1   app              int64 
 2   device           int64 
 3   os               int64 
 4   channel          int64 
 5   click_time       object