In [16]:
import pandas as pd
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')

In [17]:
import pandas as pd
from scipy.stats import zscore


z_scores = train_df.apply(zscore)

in_range = (z_scores >= -3) & (z_scores <= 3)

rows_in_range = in_range.all(axis=1)

train_df = train_df[rows_in_range]

num_rows = train_df.shape[0]

print(f"Number of rows with z-scores between -3 and 3 for all columns: {num_rows}")

Number of rows with z-scores between -3 and 3 for all columns: 7545335


In [18]:
import pandas as pd
from scipy.stats import zscore


z_scores = test_df.apply(zscore)

in_range = (z_scores >= -3) & (z_scores <= 3)

rows_in_range = in_range.all(axis=1)

test_df = test_df[rows_in_range]

num_rows = test_df.shape[0]

print(f"Number of rows with z-scores between -3 and 3 for all columns: {num_rows}")

Number of rows with z-scores between -3 and 3 for all columns: 3291584


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Assuming train_df and test_df are already defined
X = train_df.drop('IncidentGrade', axis=1)
y = train_df['IncidentGrade']

# Define the ColumnTransformer for numerical features
numerical_transformer = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), X.select_dtypes(include=['int64', 'float64']).columns)
    ])

# Define the pipeline with XGBClassifier
pipeline = ImbPipeline(steps=[
    ('preprocessor', numerical_transformer),
    ('under_sampler', RandomUnderSampler(random_state=42)),
    ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict and evaluate on training data
y_pred_train = pipeline.predict(X_test)
report_train = classification_report(y_test, y_pred_train, target_names=['Class 0', 'Class 1', 'Class 2'])
print("Training Data Classification Report:\n", report_train)

# Predict and evaluate on test data
X_test_df = test_df.drop('IncidentGrade', axis=1)
y_test_df = test_df['IncidentGrade']
y_pred_test_df = pipeline.predict(X_test_df)
report_test_df = classification_report(y_test_df, y_pred_test_df, target_names=['Class 0', 'Class 1', 'Class 2'])
print("Test Data Classification Report:\n", report_test_df)
#1

Parameters: { "use_label_encoder" } are not used.



Training Data Classification Report:
               precision    recall  f1-score   support

     Class 0       0.67      0.75      0.71    642260
     Class 1       0.44      0.52      0.48    313958
     Class 2       0.82      0.63      0.71    552849

    accuracy                           0.66   1509067
   macro avg       0.64      0.63      0.63   1509067
weighted avg       0.68      0.66      0.66   1509067

Test Data Classification Report:
               precision    recall  f1-score   support

     Class 0       0.64      0.60      0.62   1365546
     Class 1       0.33      0.52      0.41    662106
     Class 2       0.80      0.62      0.70   1263932

    accuracy                           0.59   3291584
   macro avg       0.59      0.58      0.58   3291584
weighted avg       0.64      0.59      0.61   3291584



In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

# Assuming train_df and test_df are already defined
X = train_df.drop('IncidentGrade', axis=1)
y = train_df['IncidentGrade']

# Define the ColumnTransformer for numerical features
numerical_transformer = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), X.select_dtypes(include=['int64', 'float64']).columns)
    ])

# Define the pipeline with XGBClassifier
pipeline = ImbPipeline(steps=[
    ('preprocessor', numerical_transformer),
    ('under_sampler', RandomUnderSampler(random_state=42)),
    ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

# Fit the pipeline on the entire training data
pipeline.fit(X_train, y_train)

# Predict and evaluate on the test data
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1', 'Class 2'])
print("Test Data Classification Report:\n", report)

# Assuming test_df is already defined and has the same structure as train_df
X_test_df = test_df.drop('IncidentGrade', axis=1)
y_test_df = test_df['IncidentGrade']

# Predict using the pipeline
y_pred_test_df = pipeline.predict(X_test_df)

# Evaluate the predictions
report_test_df = classification_report(y_test_df, y_pred_test_df, target_names=['Class 0', 'Class 1', 'Class 2'])
print("Test Data Classification Report:\n", report_test_df)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validation scores: [0.65637389 0.65713677 0.6577108  0.65589731 0.65729801]
Mean cross-validation score: 0.656883358960969


Parameters: { "use_label_encoder" } are not used.



Test Data Classification Report:
               precision    recall  f1-score   support

     Class 0       0.67      0.75      0.71    642260
     Class 1       0.44      0.52      0.48    313958
     Class 2       0.82      0.63      0.71    552849

    accuracy                           0.66   1509067
   macro avg       0.64      0.63      0.63   1509067
weighted avg       0.68      0.66      0.66   1509067

Test Data Classification Report:
               precision    recall  f1-score   support

     Class 0       0.64      0.60      0.62   1365546
     Class 1       0.33      0.52      0.41    662106
     Class 2       0.80      0.62      0.70   1263932

    accuracy                           0.59   3291584
   macro avg       0.59      0.58      0.58   3291584
weighted avg       0.64      0.59      0.61   3291584

