In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
from google.colab import drive
drive.mount('/content/drive')
import re
# !pip install scikeras
# from scikeras.wrappers import KerasClassifier
import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [None]:
# Set the path to the directory where the data is stored
train_df = pd.read_csv('/content/drive/MyDrive/ccac_data/dev_cleaned_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/ccac_data/dev_cleaned_test.csv')

# Data Cleaning

In [None]:

def replace_with_nan_and_count(df):
    """
    This function takes a pandas DataFrame, replaces all occurrences of
    specified placeholders with NaN, and prints the count of replacements
    for each column.

    :param df: pandas DataFrame
    :return: DataFrame with the specified values replaced with NaN, and prints counts
    """
    # Define the values you want to replace with NaN
    replace_values = ['(UNK)','UNK', '(N/A)', 'NULL', '(NULL)','nan','unk','(unk)','na','null','(null)']
    replacement_dict = {value: np.nan for value in replace_values}

    # Track and print the count of replacements for each column
    for column in df.columns:
        original_non_nan = df[column].notna().sum()  # Count non-NaN before replacement
        df[column].replace(replacement_dict, inplace=True)
        new_non_nan = df[column].notna().sum()  # Count non-NaN after replacement
        replacements = original_non_nan - new_non_nan  # The difference is the count of replacements

        if replacements > 0:
            print(f"Column '{column}': {replacements} replacements made.")

    return df


df = replace_with_nan_and_count(train_df)
df_test = replace_with_nan_and_count(test_df)

Column 'HasCustomerClickedOrOpenedEmailsSixMonthsPrior': 116138 replacements made.
Column 'CustomerCity': 13649 replacements made.
Column 'CustomerState': 13620 replacements made.
Column 'CustomerZipCode': 13588 replacements made.
Column 'CustomerInstitutionAffinity': 20849 replacements made.
Column 'HasCustomerClickedOrOpenedEmailsSixMonthsPrior': 11602 replacements made.
Column 'CustomerFirstWBBActionDate': 1102 replacements made.
Column 'CustomerFirstWBBPurchaseDate': 3979 replacements made.
Column 'CustomerLastWBBActionDate': 1102 replacements made.
Column 'CustomerLastWBBPurchaseDate': 3979 replacements made.
Column 'EventRoundName': 19118 replacements made.
Column 'IsEventFinalSite': 19118 replacements made.
Column 'EventSession': 19118 replacements made.
Column 'EventBeginDate': 19118 replacements made.
Column 'EventEndDate': 19118 replacements made.
Column 'HostingInstitution': 19118 replacements made.
Column 'FacilityName': 19241 replacements made.
Column 'FacilityDescription'

In [None]:
def replace_year_with_common_date(df, columns):
    for column in columns:
        # Convert column to datetime, errors='coerce' will turn the 'year-only' values into NaT
        df[column] = pd.to_datetime(df[column], errors='coerce', format='%m/%d/%y')

        # Find rows where the column is NaT (originally year-only values)
        year_only_rows = df[column].isna()

        # Extract the year from these rows from the original DataFrame
        years = pd.to_datetime(df.loc[year_only_rows, column], format='%Y').dt.year

        # For each unique year, find the most common date and replace the year-only values
        for year in years.unique():
            if pd.isna(year):  # Skip if year is NaT
                continue
            # Filter rows with the same year and find the most common date
            common_date = df[df[column].dt.year == year][column].mode()
            if not common_date.empty:
                # Replace year-only values with the most common date for that year
                df.loc[year_only_rows & (df[column].dt.year == year), column] = common_date.iloc[0]
            else:
                # If no common date is found, optionally handle this case (e.g., replace with a default date)
                pass

        # Convert back to original string format if needed
        df[column] = df[column].dt.strftime('%m/%d/%y')

    return df

# Example usage:
columns_to_process = ['CustomerFirstWBBActionDate', 'CustomerFirstWBBPurchaseDate', 'CustomerLastWBBActionDate', 'CustomerLastWBBPurchaseDate', 'EventBeginDate', 'EventEndDate']
df = replace_year_with_common_date(df, columns_to_process)
df_test = replace_year_with_common_date(df_test, columns_to_process)

In [None]:
def values_to_lowercase(df, columns=['CustomerState', 'CustomerCity']):
    """
    Convert values in specified columns of a DataFrame to lowercase.
    If no columns are specified, default columns are converted to lowercase.

    Parameters:
    - df: pandas DataFrame
    - columns: List of column names whose values are to be converted to lowercase.
               Defaults to ['CustomerState', 'CustomerCity'] if not specified.

    Returns:
    - DataFrame with values in specified columns converted to lowercase.
    """
    for col in columns:
        if col in df.columns:
            # Convert column values to lowercase
            df[col] = df[col].str.lower()
        else:
            print(f"Column '{col}' not found in DataFrame.")

    return df

df = values_to_lowercase(df, columns=['CustomerState', 'CustomerCity'])
df_test=values_to_lowercase(df_test, columns=['CustomerState', 'CustomerCity'])

In [None]:
def standardize_customer_state_with_print(df):
    """
    Standardize the 'CustomerState' column values in the DataFrame to a uniform format
    and print the original and replacement values. Specific unwanted values are dropped.

    Parameters:
    - df: pandas DataFrame with a 'CustomerState' column.

    Returns:
    - Modified DataFrame with standardized 'CustomerState' values.
    """

    # List of specific unwanted values to drop
    values_to_drop = ['ste', '19th', 'n', '14']

    # Drop specific unwanted values
    df = df[~df['CustomerState'].isin(values_to_drop)]

    # Dictionary to map non-standard to standard state abbreviations
    state_corrections = {
        'newjersey': 'nj',
        'newyork': 'ny',
        'southcarolina': 'sc',
        'northdakota': 'nd',
        'southdakota': 'sd',
        'northcarolina': 'nc',
        'newhampshire': 'nh',
        'britishcolumbia': 'bc',
        'pensylvania': 'pa',  # Assuming this is a misspelling of 'Pennsylvania'
        'pensylvania': 'pa',
        'maryl': 'md',
        'hyattsville': 'md',
        'a': 'il',
        'north': 'nc'
        # Add more mappings as needed
    }

    # Apply corrections
    for state in df['CustomerState'].unique():
        corrected_state = state_corrections.get(state, state)
        if state != corrected_state:
            print(f"Correcting '{state}' to '{corrected_state}'")
            df['CustomerState'] = df['CustomerState'].replace(state, corrected_state)

    return df

df = standardize_customer_state_with_print(df)
df_test = standardize_customer_state_with_print(df_test)

Correcting 'nan' to 'nan'
Correcting 'a' to 'il'
Correcting 'nan' to 'nan'


In [None]:
def map_state_to_country(state):
    state_to_country = {
        'alberta': 'Canada',
        'manitoba': 'Canada',
        'qc': 'Canada',
        'n5p4j9': 'Canada',
        'zuidholland': 'Holland',
        'istanbul': 'Turkey',
        'tokyo': 'Japan',
        'kennington': 'England',
        'jal': 'Mexico',
        'wlkp': 'Other',
        'vic': 'Spain',
        'Unknown': 'Unknown'
    }
    return state_to_country.get(state.lower(), 'USA')

# Apply the function to create the new column
df['CustomerCountry'] = df['CustomerState'].apply(map_state_to_country)
df_test['CustomerCountry'] = df_test['CustomerState'].apply(map_state_to_country)

In [None]:
def remove_special_characters_and_count_all(df):
    """
    Remove all special characters from values in all columns of a DataFrame
    and print the count of changes for each column without changing column names.

    Parameters:
    - df: pandas DataFrame.

    Returns:
    - DataFrame with special characters removed from values in all columns.
    """
    change_counts = {}  # Initialize change count dictionary

    for col in df.columns:
        # Convert all columns to string to ensure the regex can be applied
        original_values = df[col].astype(str)
        cleaned_values = original_values.apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

        # Count changes (ignoring changes that are solely due to conversion to string)
        change_counts[col] = (original_values != cleaned_values).sum()

        # Apply changes, converting back to original dtype if possible
        cleaned_series = cleaned_values.apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
        try:
            df[col] = pd.to_numeric(cleaned_series, errors='ignore')
        except ValueError:
            df[col] = cleaned_series

    # Print change counts
    for col, count in change_counts.items():
        print(f"Column '{col}': {count} changes made.")

    return df

# Remove special characters and get change counts for all columns
df = remove_special_characters_and_count_all(df)
df_test = remove_special_characters_and_count_all(df_test)

Column 'Unnamed: 0': 0 changes made.
Column 'RecordID': 0 changes made.
Column 'ChampionshipYear': 0 changes made.
Column 'CustomerID': 0 changes made.
Column 'CustomerCity': 167 changes made.
Column 'CustomerState': 0 changes made.
Column 'CustomerZipCode': 18 changes made.
Column 'CustomerInstitutionAffinity': 172 changes made.
Column 'IsCustomerInNCAAMembership': 0 changes made.
Column 'HasCustomerClickedOrOpenedEmailsSixMonthsPrior': 0 changes made.
Column 'CustomerFirstWBBActionDate': 198193 changes made.
Column 'CustomerFirstWBBPurchaseDate': 169959 changes made.
Column 'CustomerLastWBBActionDate': 198193 changes made.
Column 'CustomerLastWBBPurchaseDate': 169959 changes made.
Column 'EventRoundName': 0 changes made.
Column 'IsEventFinalSite': 0 changes made.
Column 'EventSession': 9420 changes made.
Column 'EventBeginDate': 17955 changes made.
Column 'EventEndDate': 17955 changes made.
Column 'HostingInstitution': 4549 changes made.
Column 'FacilityName': 1967 changes made.
Colu

In [None]:
  def fill_categorical_nans(df):
      """
      Fill NaN, NULL, and variations with 'unknown' in all categorical columns of the DataFrame.

      Parameters:
      - df: pandas DataFrame.

      Returns:
      - DataFrame with 'unknown' filled in for NaNs in categorical columns.
      """

      # Define variations of null values that should be considered
      null_variations = ['NaN', 'NULL', 'null', '(null)', 'N/A', 'n/a', 'na', '-', '']

      # Replace each null variation with 'unknown' in object dtype columns
      for col in df.columns:
          df[col] = df[col].apply(lambda x: 'Unknown' if str(x).strip() in null_variations else x)

      return df

  # Fill NaN and variations in categorical columns with 'unknown'
  df = fill_categorical_nans(df)
  df_test = fill_categorical_nans(df_test)

In [None]:
# df.to_csv('/content/drive/MyDrive/ccac_data/train_clean.csv', index=False)
# df_test.to_csv('/content/drive/MyDrive/ccac_data/test_clean.csv', index=False)

# ensemble 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

# Assuming 'df' and 'df_test' are already loaded with your data

# Define preprocessing
categorical_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['object']).columns.tolist()
numeric_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['int64', 'float64']).columns.tolist()
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['ActivityType'])
X_preprocessed = preprocessor.fit_transform(df.drop('ActivityType', axis=1))
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

# Define individual estimators
estimator1 = ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
estimator2 = ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
estimator3 = ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'))

# Create a VotingClassifier with soft voting
voting_clf = VotingClassifier(estimators=[estimator1, estimator2, estimator3], voting='soft')

# Create a pipeline with SMOTE and the VotingClassifier
ensemble_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('voting', voting_clf)
])

# Define a parameter grid to search over (simplified for demonstration)
param_grid = {
    'voting__rf__max_depth': [10, None],
    'voting__gb__learning_rate': [0.01, 0.1],
    # Add more parameters as needed
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(ensemble_pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train_encoded)

# Print the best parameters and the corresponding score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Predict and evaluate using the best found parameters
y_pred_val_encoded = grid_search.predict(X_val)
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val_encoded)}')
print(classification_report(y_val_encoded, y_pred_val_encoded))

# Predict on the test data
X_test_preprocessed = preprocessor.transform(df_test)  # Ensure df_test is preprocessed similarly
test_predictions_encoded = grid_search.predict(X_test_preprocessed)
df_test['ActivityType'] = label_encoder.inverse_transform(test_predictions_encoded)

# Output the prediction in the desired format
output_df = df_test[['RecordID', 'ActivityType']]
output_df.to_csv('/content/drive/MyDrive/ccac_data/ensemble_predictions_1.csv', index=False)
print(output_df.head())

In [None]:
print(f'Precision: {precision_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Recall: {recall_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'F1 Score: {f1_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, pipeline.predict(X_train))}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val_encoded)}')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

# Define preprocessing
categorical_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['object']).columns.tolist()
numeric_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['int64', 'float64']).columns.tolist()
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['ActivityType'])
X_preprocessed = preprocessor.fit_transform(df.drop('ActivityType', axis=1))
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

# Define individual estimators
estimator1 = RandomForestClassifier(n_estimators=100, random_state=42)
estimator2 = GradientBoostingClassifier(n_estimators=100, random_state=42)
estimator3 = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Create a VotingClassifier with soft voting
voting_clf = VotingClassifier(estimators=[
    ('rf', estimator1),
    ('gb', estimator2),
    ('xgb', estimator3)],
    voting='soft')

# Create a pipeline with SMOTE and the VotingClassifier
ensemble_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('voting', voting_clf)
])

# Train the model on the training set
ensemble_pipeline.fit(X_train, y_train_encoded)

# Predict on the validation set
y_pred_val_encoded = ensemble_pipeline.predict(X_val)

# Print precision, recall, f1-score, and accuracy for the validation set using the encoded labels
print(f'Precision: {precision_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Recall: {recall_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'F1 Score: {f1_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, ensemble_pipeline.predict(X_train))}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val_encoded)}')
print(classification_report(y_val_encoded, y_pred_val_encoded))

# Predict on the test data using the trained pipeline
X_test_preprocessed = preprocessor.transform(df_test)
test_predictions_encoded = ensemble_pipeline.predict(X_test_preprocessed)
df_test['ActivityType'] = label_encoder.inverse_transform(test_predictions_encoded)

# Output the prediction in the desired format
output_df = df_test[['RecordID', 'ActivityType']]
output_df.to_csv('/content/drive/MyDrive/ccac_data/ensemble_predictions_1.csv', index=False)
print(output_df.head())


Precision: 0.5780253768667726
Recall: 0.6226365308334768
F1 Score: 0.5825139217005352
Train Accuracy: 0.9959081514402113
Validation Accuracy: 0.9829159896779126
              precision    recall  f1-score   support

           0       0.33      0.44      0.38       142
           1       1.00      1.00      1.00     38237
           2       0.22      0.11      0.14        19
           3       0.97      0.87      0.92      2612
           4       0.28      0.66      0.40       171
           5       0.66      0.65      0.66       671

    accuracy                           0.98     41852
   macro avg       0.58      0.62      0.58     41852
weighted avg       0.99      0.98      0.98     41852

   RecordID ActivityType
0     26355  No Activity
1     26641  No Activity
2     26836  No Activity
3     26900  No Activity
4     26948  No Activity


# ensemble 2

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

# Define preprocessing
categorical_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['object']).columns.tolist()
numeric_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['int64', 'float64']).columns.tolist()


preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['ActivityType'])
X_preprocessed = preprocessor.fit_transform(df.drop('ActivityType', axis=1))

# Splitting the data
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

# Compute class weights for imbalance handling
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)
class_weights_dict = dict(enumerate(class_weights))

# Update individual estimators with class_weight where applicable
estimator1 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weights_dict)
estimator2 = GradientBoostingClassifier(n_estimators=100, random_state=42)  # No class_weight parameter
estimator3 = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')



# Create a VotingClassifier with soft voting
voting_clf = VotingClassifier(estimators=[
    ('rf', estimator1),
    ('gb', estimator2),
    ('xgb', estimator3)],
    voting='soft')

from collections import Counter

# Count the occurrences of each class in the target variable
class_counts = Counter(y_train_encoded)

# Example: Increase the sample size of minority classes with SMOTE
# Decrease the sample size of the majority class with RandomUnderSampler
# This is a placeholder; adjust according to your specific needs
sampling_strategy_smote = {class_label: int(count * 1.5) for class_label, count in class_counts.items() if count / len(y_train_encoded) < 0.91}
sampling_strategy_under = {class_label: int(count * 0.5) for class_label, count in class_counts.items() if count / len(y_train_encoded) >= 0.91}

# Update the pipeline steps for SMOTE and RandomUnderSampler
imbalance_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42, sampling_strategy=sampling_strategy_smote)),  # Adjust for multi-class
    ('under', RandomUnderSampler(random_state=42, sampling_strategy=sampling_strategy_under)),  # Adjust for multi-class
    ('voting', voting_clf)
])


# Train the model on the training set
imbalance_pipeline.fit(X_train, y_train_encoded)

# Predict on the validation set
y_pred_val_encoded = imbalance_pipeline.predict(X_val)

# Print evaluation metrics for the validation set
print(f'Precision: {precision_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Recall: {recall_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'F1 Score: {f1_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, imbalance_pipeline.predict(X_train))}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val_encoded)}')
print(classification_report(y_val_encoded, y_pred_val_encoded))

# Predict on the test data using the trained pipeline
X_test_preprocessed = preprocessor.transform(df_test)
test_predictions_encoded = imbalance_pipeline.predict(X_test_preprocessed)
df_test['ActivityType'] = label_encoder.inverse_transform(test_predictions_encoded)


Precision: 0.5956815404980214
Recall: 0.5006641849828212
F1 Score: 0.5102475204569987
Train Accuracy: 0.9947074776292367
Validation Accuracy: 0.9868584535983943
              precision    recall  f1-score   support

           0       0.44      0.16      0.24       142
           1       1.00      1.00      1.00     38237
           2       0.20      0.05      0.08        19
           3       0.91      0.96      0.93      2612
           4       0.35      0.05      0.08       171
           5       0.67      0.78      0.72       671

    accuracy                           0.99     41852
   macro avg       0.60      0.50      0.51     41852
weighted avg       0.98      0.99      0.98     41852

   RecordID ActivityType
0     26355  No Activity
1     26641  No Activity
2     26836  No Activity
3     26900  No Activity
4     26948  No Activity


# xgb

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

# Assuming 'df' is your DataFrame and 'ActivityType' is your target column
X = df.drop('ActivityType', axis=1)  # Features
y = df['ActivityType']  # Target

# Define preprocessing for numeric and categorical features
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Preprocess the features
X_preprocessed = preprocessor.fit_transform(X)

# Split the dataset into training and validation sets
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

# Initialize XGBClassifier with some predefined parameters
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=100, learning_rate=0.1, max_depth=4, subsample=0.8, colsample_bytree=0.8)

# Enhanced imbalance handling using a combined SMOTE and RandomUnderSampler approach
smote = SMOTE(sampling_strategy='auto', random_state=42)
under = RandomUnderSampler(sampling_strategy='auto', random_state=42)
steps = [('smote', smote), ('under', under), ('model', xgb_clf)]
pipeline = ImbPipeline(steps=steps)

# Train the model
pipeline.fit(X_train, y_train_encoded)

# Predictions with the trained model
y_pred_train = pipeline.predict(X_train)
y_pred_val = pipeline.predict(X_val)

# Evaluation with the trained model
print(f'Train Precision: {precision_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation Precision: {precision_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train Recall: {recall_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation Recall: {recall_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train F1 Score: {f1_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation F1 Score: {f1_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, y_pred_train)}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val)}')
print('\nValidation Classification Report:\n', classification_report(y_val_encoded, y_pred_val))


# Assuming df_test is your test DataFrame and it has been processed similarly to how X_train was processed
X_test = df_test
X_test_preprocessed = preprocessor.transform(X_test)  # Use the preprocessor to transform the test data

# Predict the 'ActivityType' using the trained pipeline
y_test_pred_encoded = pipeline.predict(X_test_preprocessed)

# Decode the predicted labels back to original labels
y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)

# Create the output DataFrame
output_df = pd.DataFrame({
    'RecordID': df_test['RecordID'],
    'ActivityType': y_test_pred  # Adjust column name as per your requirement
})

# Save the output DataFrame to a CSV file
output_df.to_csv('/content/drive/MyDrive/ccac_data/xgb_2.csv', index=False)  # Adjust the path as per your environment

print(output_df.head())

Train Precision: 0.638149439633978
Validation Precision: 0.6327864376774649
Train Recall: 0.6833279831690331
Validation Recall: 0.6296562108075059
Train F1 Score: 0.6047858836320931
Validation F1 Score: 0.5527872748485475
Train Accuracy: 0.976828787498656
Validation Accuracy: 0.9763452164771098

Validation Classification Report:
               precision    recall  f1-score   support

           0       0.30      0.54      0.38       142
           1       1.00      1.00      1.00     38237
           2       0.67      0.11      0.18        19
           3       0.99      0.79      0.88      2612
           4       0.19      0.85      0.31       171
           5       0.65      0.49      0.56       671

    accuracy                           0.98     41852
   macro avg       0.63      0.63      0.55     41852
weighted avg       0.99      0.98      0.98     41852



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

# Assuming 'df' is your DataFrame and 'ActivityType' is your target column
X = df.drop('ActivityType', axis=1)  # Features
y = df['ActivityType']  # Target

# Define preprocessing for numeric and categorical features
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

# Initialize XGBClassifier with some predefined parameters
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Enhanced imbalance handling using a combined SMOTE and RandomUnderSampler approach
smote = SMOTE(sampling_strategy='auto', random_state=42)
under = RandomUnderSampler(sampling_strategy='auto', random_state=42)
steps = [('preprocessor', preprocessor), ('smote', smote), ('under', under), ('model', xgb_clf)]
pipeline = ImbPipeline(steps=steps)

# Grid of parameters to choose from
parameters = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1],
    'model__max_depth': [3, 4, 5],
    'model__subsample': [0.7, 0.8],
    'model__colsample_bytree': [0.7, 0.8],
}

# Setup the grid search
grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=3, scoring='accuracy', verbose=2)

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Fit the grid search to the data
grid_search.fit(X, y_encoded)

# Best parameter set
print(f'Best parameters found: {grid_search.best_params_}')

# Use the best estimator for further predictions
best_estimator = grid_search.best_estimator_

# Predictions with the trained model (on a separate test set, if available)
# Note: Ensure you have a separate test set or split your data accordingly

# Example to demonstrate how you would perform predictions with the best estimator
# y_test_pred_encoded = best_estimator.predict(X_test_preprocessed)
# y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)
# Proceed with evaluating the model as needed


Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END model__colsample_bytree=0.7, model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100, model__subsample=0.7; total time= 1.6min
[CV] END model__colsample_bytree=0.7, model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100, model__subsample=0.7; total time= 1.5min
[CV] END model__colsample_bytree=0.7, model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100, model__subsample=0.7; total time= 1.5min
[CV] END model__colsample_bytree=0.7, model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100, model__subsample=0.8; total time= 1.6min
[CV] END model__colsample_bytree=0.7, model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100, model__subsample=0.8; total time= 1.5min
[CV] END model__colsample_bytree=0.7, model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100, model__subsample=0.8; total time= 1.5min
[CV] END model__colsample_bytree

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Fit the model on the training set
best_estimator.fit(X_train, y_train)

# Predict on the validation set
y_val_pred_encoded = best_estimator.predict(X_val)

# Convert encoded labels back to original labels for reporting
y_val_pred = label_encoder.inverse_transform(y_val_pred_encoded)
y_val_original = label_encoder.inverse_transform(y_val)

# Calculate evaluation metrics for the validation set
print(f'Precision: {precision_score(y_val, y_val_pred_encoded, average="macro")}')
print(f'Recall: {recall_score(y_val, y_val_pred_encoded, average="macro")}')
print(f'F1 Score: {f1_score(y_val, y_val_pred_encoded, average="macro")}')
print(f'Validation Accuracy: {accuracy_score(y_val, y_val_pred_encoded)}')
print(classification_report(y_val_original, y_val_pred))

# Since y_train is used for fitting, you can't directly get a train accuracy without predicting it again,
# which is redundant for this scenario. Usually, validation metrics are sufficient to understand model performance.


# rf

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter

# Assuming 'df' is your DataFrame and 'ActivityType' is your target column
X = df.drop('ActivityType', axis=1)  # Features
y = df['ActivityType']  # Target

# Define preprocessing for numeric and categorical features
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Preprocess the features
X_preprocessed = preprocessor.fit_transform(X)

# Split the dataset into training and validation sets
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

# Compute class weights for imbalance handling
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)
class_weights_dict = dict(enumerate(class_weights))

# Initialize and train the RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weights_dict)
rf_clf.fit(X_train, y_train_encoded)

# Predict on training and validation sets
rf_pred_train = rf_clf.predict(X_train)
rf_pred_val = rf_clf.predict(X_val)

# Evaluation metrics
print(f'Train Precision: {precision_score(y_train_encoded, rf_pred_train, average="macro")}')
print(f'Validation Precision: {precision_score(y_val_encoded, rf_pred_val, average="macro")}')
print(f'Train Recall: {recall_score(y_train_encoded, rf_pred_train, average="macro")}')
print(f'Validation Recall: {recall_score(y_val_encoded, rf_pred_val, average="macro")}')
print(f'Train F1 Score: {f1_score(y_train_encoded, rf_pred_train, average="macro")}')
print(f'Validation F1 Score: {f1_score(y_val_encoded, rf_pred_val, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, rf_pred_train)}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, rf_pred_val)}')
print('\nValidation Classification Report:\n', classification_report(y_val_encoded, rf_pred_val))

# Validation Classification Report:
#                precision    recall  f1-score   support

#            0       0.29      0.16      0.21       142
#            1       1.00      1.00      1.00     38237
#            2       0.50      0.11      0.17        19
#            3       0.89      0.95      0.92      2612
#            4       0.21      0.05      0.08       171
#            5       0.65      0.70      0.67       671

#     accuracy                           0.98     41852
#    macro avg       0.59      0.49      0.51     41852
# weighted avg       0.98      0.98      0.98     41852

# logistic

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression

# Assuming 'df' is your DataFrame and 'ActivityType' is your target column
X = df.drop('ActivityType', axis=1)  # Features
y = df['ActivityType']  # Target

# Define preprocessing for numeric and categorical features
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Preprocess the features
X_preprocessed = preprocessor.fit_transform(X)

# Split the dataset into training and validation sets
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

# Initialize LogisticRegression with some predefined parameters
log_reg = LogisticRegression(max_iter=1000)

# Enhanced imbalance handling using a combined SMOTE and RandomUnderSampler approach
smote = SMOTE(sampling_strategy='auto', random_state=42)
under = RandomUnderSampler(sampling_strategy='auto', random_state=42)
steps = [('smote', smote), ('under', under), ('model', log_reg)]
pipeline = ImbPipeline(steps=steps)

# Train the model
pipeline.fit(X_train, y_train_encoded)

# Predictions with the trained model
y_pred_train = pipeline.predict(X_train)
y_pred_val = pipeline.predict(X_val)

# Evaluation with the trained model
print(f'Train Precision: {precision_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation Precision: {precision_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train Recall: {recall_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation Recall: {recall_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train F1 Score: {f1_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation F1 Score: {f1_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, y_pred_train)}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val)}')
print('\nValidation Classification Report:\n', classification_report(y_val_encoded, y_pred_val))


Train Precision: 0.6578550722710735
Validation Precision: 0.5612309151110478
Train Recall: 0.8551535980215063
Validation Recall: 0.6276773013644936
Train F1 Score: 0.6984487825875138
Validation F1 Score: 0.5721700442716308
Train Accuracy: 0.9885129565248557
Validation Accuracy: 0.9832982892095957

Validation Classification Report:
               precision    recall  f1-score   support

           0       0.32      0.65      0.43       142
           1       1.00      1.00      1.00     38237
           2       0.03      0.16      0.05        19
           3       0.97      0.89      0.93      2612
           4       0.29      0.46      0.35       171
           5       0.77      0.60      0.68       671

    accuracy                           0.98     41852
   macro avg       0.56      0.63      0.57     41852
weighted avg       0.99      0.98      0.99     41852



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Load your dataset
# df = pd.read_csv('path_to_your_dataset.csv')

# Define preprocessing for numeric and categorical features
categorical_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['object']).columns.tolist()
numeric_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['ActivityType'])

# Split the dataset into training and validation sets
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(
    df.drop('ActivityType', axis=1),  # Features
    y_encoded,  # Encoded target
    test_size=0.2,
    random_state=42
)

# Initialize LogisticRegression with balanced class weights and the 'multinomial' option
log_reg = LogisticRegression(max_iter=1000, solver='saga', multi_class='multinomial', class_weight='balanced', random_state=42)

# Enhanced imbalance handling using a combined SMOTE and RandomUnderSampler approach
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=5, n_jobs=1)  # Adjust k_neighbors if needed
under = RandomUnderSampler(sampling_strategy='auto', random_state=42)

steps = [
    ('preprocessor', preprocessor),  # Preprocess the data
    ('smote', smote),  # SMOTE for oversampling
    ('under', under),  # RandomUnderSampler for undersampling
    ('model', log_reg)  # Logistic Regression model
]
pipeline = ImbPipeline(steps=steps)

# Train the model
pipeline.fit(X_train, y_train_encoded)

# Predictions with the trained model
y_pred_train = pipeline.predict(X_train)
y_pred_val = pipeline.predict(X_val)

# Evaluation with the trained model
print(f'Train Precision: {precision_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation Precision: {precision_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train Recall: {recall_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation Recall: {recall_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train F1 Score: {f1_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation F1 Score: {f1_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, y_pred_train)}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val)}')
print('\nValidation Classification Report:\n', classification_report(y_val_encoded, y_pred_val))


Train Precision: 0.6554909649581192
Validation Precision: 0.560800132751079
Train Recall: 0.8541698929490207
Validation Recall: 0.6262058715065237
Train F1 Score: 0.6941890537952454
Validation F1 Score: 0.5708202921505713
Train Accuracy: 0.9883755659892716
Validation Accuracy: 0.9832266080474051

Validation Classification Report:
               precision    recall  f1-score   support

           0       0.31      0.65      0.42       142
           1       1.00      1.00      1.00     38237
           2       0.03      0.16      0.04        19
           3       0.97      0.89      0.93      2612
           4       0.29      0.46      0.35       171
           5       0.77      0.60      0.67       671

    accuracy                           0.98     41852
   macro avg       0.56      0.63      0.57     41852
weighted avg       0.99      0.98      0.99     41852



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Assuming 'df' is your DataFrame and 'ActivityType' is your target column

# Define preprocessing for numeric and categorical features
categorical_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['object']).columns.tolist()
numeric_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['ActivityType'])

# Split the dataset into training and validation sets
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(
    df.drop('ActivityType', axis=1),  # Features
    y_encoded,  # Encoded target
    test_size=0.2,
    random_state=42
)

# Define the pipeline
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('under', RandomUnderSampler(random_state=42)),
    ('model', LogisticRegression(random_state=42, max_iter=1000))
])

# Define the parameter grid
param_grid = {
    'model__C': [0.01, 0.1, 1, 10],
    'model__solver': ['liblinear', 'saga'],
    'smote__k_neighbors': [3, 5, 7]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model using GridSearchCV
grid_search.fit(X_train, y_train_encoded)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

# Predictions with the best model from grid search
y_pred_train = grid_search.predict(X_train)
y_pred_val = grid_search.predict(X_val)

# Evaluation with the best model
print(f'Train Precision: {precision_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation Precision: {precision_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train Recall: {recall_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation Recall: {recall_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train F1 Score: {f1_score(y_train_encoded, y_pred_train, average="macro")}')
print(f'Validation F1 Score: {f1_score(y_val_encoded, y_pred_val, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, y_pred_train)}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val)}')
print('\nValidation Classification Report:\n', classification_report(y_val_encoded, y_pred_val))


Best parameters found:  {'model__C': 1, 'model__solver': 'saga', 'smote__k_neighbors': 7}
Train Precision: 0.6574633037759763
Validation Precision: 0.5626240029068604
Train Recall: 0.8549934175520407
Validation Recall: 0.6293971167167508
Train F1 Score: 0.6974723980823683
Validation F1 Score: 0.5735453342894027
Train Accuracy: 0.9884651685124787
Validation Accuracy: 0.9833938640925165

Validation Classification Report:
               precision    recall  f1-score   support

           0       0.31      0.65      0.42       142
           1       1.00      1.00      1.00     38237
           2       0.03      0.16      0.05        19
           3       0.97      0.89      0.93      2612
           4       0.30      0.47      0.36       171
           5       0.77      0.61      0.68       671

    accuracy                           0.98     41852
   macro avg       0.56      0.63      0.57     41852
weighted avg       0.99      0.98      0.99     41852



# ensemble 2 with manual cleaning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from collections import Counter

# Assuming df is your DataFrame containing the training data and df_test is the DataFrame for predictions

# Define preprocessing
categorical_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['object']).columns.tolist()
numeric_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['ActivityType'])
X_preprocessed = preprocessor.fit_transform(df.drop('ActivityType', axis=1))

# Splitting the data
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

from collections import Counter

# Count the occurrences of each class in the target variable
class_counts = Counter(y_train_encoded)
sampling_strategy_smote = {class_label: int(count * 1.5) for class_label, count in class_counts.items() if count / len(y_train_encoded) < 0.91}
sampling_strategy_under = {class_label: int(count * 0.5) for class_label, count in class_counts.items() if count / len(y_train_encoded) >= 0.91}

# Compute class weights for imbalance handling
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)
class_weights_dict = dict(enumerate(class_weights))

# Initialize estimators
estimator1 = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_leaf=1, min_samples_split=2, random_state=42, class_weight=class_weights_dict, n_jobs=-1 )
estimator2 = GradientBoostingClassifier(n_estimators=100, random_state=42)
estimator3 = XGBClassifier(random_state=42, use_label_encoder=False, colsample_bytree=0.7,
                           learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.7, eval_metric='mlogloss', n_jobs=-1 )
estimator4 = LogisticRegression(max_iter=1000, random_state=42)  # Adding LogisticRegression

# Create a VotingClassifier with soft voting
voting_clf = VotingClassifier(estimators=[
    ('rf', estimator1),
    ('gb', estimator2),
    ('xgb', estimator3),
    ('lr', estimator4)],  # Add LogisticRegression to the voting classifier
    voting='soft')


# Define the imbalance pipeline
imbalance_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42, sampling_strategy=sampling_strategy_smote)),
    ('under', RandomUnderSampler(random_state=42, sampling_strategy=sampling_strategy_under)),
    ('voting', voting_clf)
])

# Train the model on the training set
imbalance_pipeline.fit(X_train, y_train_encoded)

# Predict on the validation set
y_pred_val_encoded = imbalance_pipeline.predict(X_val)

# Print evaluation metrics for the validation set
print(f'Precision: {precision_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Recall: {recall_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'F1 Score: {f1_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, imbalance_pipeline.predict(X_train))}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val_encoded)}')
print(classification_report(y_val_encoded, y_pred_val_encoded))

# Predict on the test data using the trained pipeline
X_test_preprocessed = preprocessor.transform(df_test)
test_predictions_encoded = imbalance_pipeline.predict(X_test_preprocessed)
df_test['ActivityType'] = label_encoder.inverse_transform(test_predictions_encoded)

# Output the prediction in the desired format
output_df = df_test[['RecordID', 'ActivityType']]
output_df.to_csv('/content/drive/MyDrive/ccac_data/ensemble_predictions_cleaned_gs_lr.csv', index=False)
print(output_df.head())


# Best Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from collections import Counter

# Assuming df is your DataFrame containing the training data and df_test is the DataFrame for predictions

# Define preprocessing
categorical_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['object']).columns.tolist()
numeric_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['ActivityType'])
X_preprocessed = preprocessor.fit_transform(df.drop('ActivityType', axis=1))

# Splitting the data
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

from collections import Counter

# Count the occurrences of each class in the target variable
class_counts = Counter(y_train_encoded)
sampling_strategy_smote = {class_label: int(count * 1.5) for class_label, count in class_counts.items() if count / len(y_train_encoded) < 0.91}
sampling_strategy_under = {class_label: int(count * 0.5) for class_label, count in class_counts.items() if count / len(y_train_encoded) >= 0.91}

# Compute class weights for imbalance handling
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)
class_weights_dict = dict(enumerate(class_weights))

# Initialize estimators
estimator1 = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_leaf=1, min_samples_split=2, random_state=42, class_weight=class_weights_dict, n_jobs=-1 )
estimator2 = GradientBoostingClassifier(n_estimators=100, random_state=42)
estimator3 = XGBClassifier(random_state=42, use_label_encoder=False, colsample_bytree=0.7,
                           learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.7, eval_metric='mlogloss', n_jobs=-1 )
estimator4 = LogisticRegression(max_iter=1000, random_state=42)  # Adding LogisticRegression

# Create a VotingClassifier with soft voting
voting_clf = VotingClassifier(estimators=[
    ('rf', estimator1),
    ('gb', estimator2),
    ('xgb', estimator3),
    ('lr', estimator4)],  # Add LogisticRegression to the voting classifier
    voting='soft')


# Define the imbalance pipeline
imbalance_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42, sampling_strategy=sampling_strategy_smote)),
    ('under', RandomUnderSampler(random_state=42, sampling_strategy=sampling_strategy_under)),
    ('voting', voting_clf)
])

# Train the model on the training set
imbalance_pipeline.fit(X_train, y_train_encoded)

# Predict on the validation set
y_pred_val_encoded = imbalance_pipeline.predict(X_val)

# Print evaluation metrics for the validation set
print(f'Precision: {precision_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Recall: {recall_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'F1 Score: {f1_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, imbalance_pipeline.predict(X_train))}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val_encoded)}')
print(classification_report(y_val_encoded, y_pred_val_encoded))

# Predict on the test data using the trained pipeline
X_test_preprocessed = preprocessor.transform(df_test)
test_predictions_encoded = imbalance_pipeline.predict(X_test_preprocessed)
df_test['ActivityType'] = label_encoder.inverse_transform(test_predictions_encoded)

# Output the prediction in the desired format
output_df = df_test[['RecordID', 'ActivityType']]
output_df.to_csv('/content/drive/MyDrive/ccac_data/ensemble_predictions_cleaned_gs_lr.csv', index=False)
print(output_df.head())


Precision: 0.6352314079658715
Recall: 0.5178443845867483
F1 Score: 0.5339687439380824
Train Accuracy: 0.9943132265271256
Validation Accuracy: 0.9877903087068718
              precision    recall  f1-score   support

           0       0.54      0.19      0.28       142
           1       1.00      1.00      1.00     38237
           2       0.25      0.05      0.09        19
           3       0.92      0.96      0.94      2612
           4       0.41      0.09      0.14       171
           5       0.70      0.81      0.75       671

    accuracy                           0.99     41852
   macro avg       0.64      0.52      0.53     41852
weighted avg       0.99      0.99      0.99     41852



OSError: Cannot save file into a non-existent directory: '/kaggle/working'

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from collections import Counter

# Assuming df is your DataFrame containing the training data and df_test is the DataFrame for predictions

# Define preprocessing
categorical_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['object']).columns.tolist()
numeric_columns = df.drop('ActivityType', axis=1).select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns)
], remainder='passthrough')

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['ActivityType'])
X_preprocessed = preprocessor.fit_transform(df.drop('ActivityType', axis=1))

# Splitting the data
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

from collections import Counter

# Count the occurrences of each class in the target variable
class_counts = Counter(y_train_encoded)
sampling_strategy_smote = {class_label: int(count * 1.5) for class_label, count in class_counts.items() if count / len(y_train_encoded) < 0.91}
sampling_strategy_under = {class_label: int(count * 0.5) for class_label, count in class_counts.items() if count / len(y_train_encoded) >= 0.91}

# Compute class weights for imbalance handling
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)
class_weights_dict = dict(enumerate(class_weights))

# Initialize estimators
estimator1 = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_leaf=1, min_samples_split=2, random_state=42, class_weight=class_weights_dict, n_jobs=-1 )
estimator2 = GradientBoostingClassifier(n_estimators=100, random_state=42)
estimator3 = XGBClassifier(random_state=42, use_label_encoder=False, colsample_bytree=0.7,
                           learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.7, eval_metric='mlogloss', n_jobs=-1 )
estimator4 = LogisticRegression(max_iter=1000, random_state=42)  # Adding LogisticRegression

# Create a VotingClassifier with soft voting
voting_clf = VotingClassifier(estimators=[
    ('rf', estimator1),
    ('gb', estimator2),
    ('xgb', estimator3),
    ('lr', estimator4)],  # Add LogisticRegression to the voting classifier
    voting='soft',weights=[1, 1, 1,2])


# Define the imbalance pipeline
imbalance_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42, sampling_strategy=sampling_strategy_smote)),
    ('under', RandomUnderSampler(random_state=42, sampling_strategy=sampling_strategy_under)),
    ('voting', voting_clf,)
])

# Train the model on the training set
imbalance_pipeline.fit(X_train, y_train_encoded)

# Predict on the validation set
y_pred_val_encoded = imbalance_pipeline.predict(X_val)

# Print evaluation metrics for the validation set
print(f'Precision: {precision_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Recall: {recall_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'F1 Score: {f1_score(y_val_encoded, y_pred_val_encoded, average="macro")}')
print(f'Train Accuracy: {accuracy_score(y_train_encoded, imbalance_pipeline.predict(X_train))}')
print(f'Validation Accuracy: {accuracy_score(y_val_encoded, y_pred_val_encoded)}')
print(classification_report(y_val_encoded, y_pred_val_encoded))

# Predict on the test data using the trained pipeline
X_test_preprocessed = preprocessor.transform(df_test)
test_predictions_encoded = imbalance_pipeline.predict(X_test_preprocessed)
df_test['ActivityType'] = label_encoder.inverse_transform(test_predictions_encoded)

# Output the prediction in the desired format
output_df = df_test[['RecordID', 'ActivityType']]
output_df.to_csv('/content/drive/MyDrive/ccac_data/ensemble_best_exp1.csv', index=False)
print(output_df.head())


Precision: 0.6950321658053565
Recall: 0.525721131346141
F1 Score: 0.5464248051206169
Train Accuracy: 0.9941877829946358
Validation Accuracy: 0.9882203956800153
              precision    recall  f1-score   support

           0       0.58      0.18      0.27       142
           1       1.00      1.00      1.00     38237
           2       0.50      0.05      0.10        19
           3       0.92      0.96      0.94      2612
           4       0.46      0.13      0.21       171
           5       0.71      0.83      0.76       671

    accuracy                           0.99     41852
   macro avg       0.70      0.53      0.55     41852
weighted avg       0.99      0.99      0.99     41852

   RecordID ActivityType
0     26355  No Activity
1     26641  No Activity
2     26836  No Activity
3     26900  No Activity
4     26948  No Activity
