In [None]:
# Install required libraries
# sqlite3 ships with Python's standard library and cannot be installed with pip.
# Install the third-party packages if they are missing:
# %pip install pandas numpy scikit-learn pandasql

import sqlite3
import pandas as pd

# Set working directory
import os

# All CSVs below (and the later reload of Demographics.csv) are opened with
# relative paths, so the working directory decides where they are read from.
# The folder can be overridden with the TRAINING_DATA_DIR environment variable;
# when it is unset, the original hard-coded location is used.
DATA_DIR = os.environ.get("TRAINING_DATA_DIR", "/Users/jade/Desktop/Humana/Training")
os.chdir(DATA_DIR)

# Load datasets using pandas
features_data = pd.read_csv("Afeatures.csv")
control_data = pd.read_csv("Controlpoint.csv")
cost_data = pd.read_csv("CostUt.csv")
demograph_data = pd.read_csv("Demographics.csv")
condition_data = pd.read_csv("mcondition.csv")
detail_data = pd.read_csv("mdetail.csv")
claims_data = pd.read_csv("mclaims.csv")
target_data = pd.read_csv("Tmembers.csv")
members_data = pd.read_csv("mdata.csv")
pharmacy_data = pd.read_csv("PU.csv")
quality_data = pd.read_csv("QD.csv")
sales_data = pd.read_csv("SC.csv")
social_data = pd.read_csv("Socialh.csv")
web_data = pd.read_csv("WA.csv")

In [2]:
import re
def convert_tenure(value):
    """Turn a tenure-band label into a single number of years.

    Strings are only interpreted when they contain the upper-case word
    'YEARS':

    * with a '-' and exactly two numbers ('X - Y YEARS'), the midpoint of the
      two numbers is returned;
    * without a '-' and with exactly one number ('X+ YEARS' or 'X YEARS'),
      that number is returned as a float;
    * any other string yields None.

    Floats and ints (which includes NaN, a float) are returned unchanged.
    Every other type, such as None, yields None.
    """
    # Numeric inputs pass straight through.
    if isinstance(value, (float, int)):
        return value

    # Anything that is not a 'YEARS' label cannot be interpreted.
    if not isinstance(value, str) or 'YEARS' not in value:
        return None

    found = re.findall(r'\d*\.?\d+', value)

    # Range label: needs exactly two numbers, otherwise it is rejected.
    if '-' in value:
        if len(found) == 2:
            low, high = float(found[0]), float(found[1])
            return (low + high) / 2
        return None

    # Open-ended ('X+') and plain ('X') labels: exactly one number is required.
    return float(found[0]) if len(found) == 1 else None

# Convert the tenure band labels to numbers
members_data['tenure_band'] = members_data['tenure_band'].apply(convert_tenure)

# Recode the Y/N indicator columns as 1/0; other values are left untouched
yes_no_codes = {'Y': 1, 'N': 0}
for indicator in ('disabled_ind', 'dual_eligible_ind', 'lis_ind'):
    members_data[indicator] = members_data[indicator].replace(yes_no_codes)

In [3]:
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# Function to remove high correlation and low variance columns
def clean_data(df, variance_threshold=0.01, correlation_threshold=0.9):
    """Drop low-variance columns, then one column of each highly correlated pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is handed as-is to ``VarianceThreshold`` and
        ``DataFrame.corr``; presumably every column has to be numeric for
        those calls to succeed -- TODO confirm against the loaded tables.
    variance_threshold : float, default 0.01
        Threshold passed to ``VarianceThreshold``; columns it does not
        support are removed.
    correlation_threshold : float, default 0.9
        A column is removed when its absolute correlation with any column
        that precedes it (in column order) is greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving columns, with the index and the
        column dtypes of ``df`` preserved.
    """
    # Step 1: Remove variables with low variance
    selector = VarianceThreshold(threshold=variance_threshold)
    selector.fit(df)

    # Pick the surviving columns from the original frame with the boolean
    # support mask. Rebuilding a frame from the transformed ndarray would
    # replace the index with a fresh RangeIndex and coerce every column
    # (including integer keys such as 'id') to float.
    df_filtered = df.loc[:, selector.get_support()]

    # Step 2: Remove highly correlated variables
    corr_matrix = df_filtered.corr().abs()
    # Keep only the strict upper triangle so each pair is looked at once and
    # the later column of a correlated pair is the one flagged.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > correlation_threshold)]

    # Drop highly correlated columns (returns a new frame; df is not modified)
    df_cleaned = df_filtered.drop(columns=to_drop)

    return df_cleaned

# Run each loaded table through clean_data, in the order listed, and bind the
# filtered frames to their *_cleaned names.
(
    members_data_cleaned,
    pharmacy_data_cleaned,
    social_data_cleaned,
    web_data_cleaned,
) = map(clean_data, (members_data, pharmacy_data, social_data, web_data))

In [None]:
# Inner-join the cleaned member, pharmacy, social and web tables on 'id'.
# Every step continues from the running merged_data_cleaned result; merging
# from merged_data instead would raise NameError on a fresh kernel (it is only
# created in a later cell) and would throw away the previous cleaned joins.
merged_data_cleaned = pd.merge(members_data_cleaned, pharmacy_data_cleaned, on='id', how='inner')
merged_data_cleaned = pd.merge(merged_data_cleaned, social_data_cleaned, on='id', how='inner')
merged_data_cleaned = pd.merge(merged_data_cleaned, web_data_cleaned, on='id', how='inner')

In [5]:
# Start from the member table and inner-join the pharmacy, social and web
# tables onto it, one after another, using 'id' as the key.
merged_data = members_data
for right_table in (pharmacy_data, social_data, web_data):
    merged_data = pd.merge(merged_data, right_table, on='id', how='inner')

In [6]:
# Remove the three descriptive columns from the target table, then inner-join
# what remains onto the merged frame by 'id'.
new_target = target_data.drop(columns=['product_type', 'calendar_year', 'plan_category'])
merged_data = pd.merge(left=merged_data, right=new_target, how='inner', on='id')

In [None]:
# Every loaded table, keyed by the name of the variable that holds it
datasets = {
    'features_data': features_data,
    'control_data': control_data,
    'cost_data': cost_data,
    'demograph_data': demograph_data,
    'condition_data': condition_data,
    'detail_data': detail_data,
    'claims_data': claims_data,
    'target_data': target_data,
    'members_data': members_data,
    'pharmacy_data': pharmacy_data,
    'quality_data': quality_data,
    'sales_data': sales_data,
    'social_data': social_data,
    'web_data': web_data
}

# Check number of rows and the column summary for each dataset
for name, data in datasets.items():
    print(f"{name} has {len(data)} rows")
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so it is called on its own instead of being interpolated into a string
    # (which printed "<name> has None rows").
    data.info()

In [45]:
# Scratch inventory of the loaded tables. The uncommented names are bare
# expressions with no effect of their own; only the last one (web_data) is
# rendered as this cell's output. The commented-out tables are presumably the
# ones left out of the merges -- TODO confirm.
#condition_data
#claims_data
#quality_data
features_data 
control_data 
cost_data 
demograph_data # two columns are dropped from this table before it is merged (lang_spoken_cd, rucc_category)
#detail_data  
target_data 
members_data 
pharmacy_data 
#sales_data 
social_data 
web_data 

In [7]:
# Extend the running merged frame with the feature, control-point and cost
# tables, inner-joining each one on 'id' in turn.
for extra_table in (features_data, control_data, cost_data):
    merged_data = pd.merge(merged_data, extra_table, on='id', how='inner')

In [None]:
# DataFrame.info() prints the summary itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
merged_data.info()

In [102]:
# Re-read the demographics table from disk (relative path, so it depends on the
# working directory set in the first cell). Presumably this restores the full
# set of columns so the drop in the next cell can be run again -- TODO confirm.
demograph_data = pd.read_csv("Demographics.csv")

In [9]:
# Demographic columns that are removed before the table is merged
unused_demographic_columns = ['lang_spoken_cd', 'rucc_category']
demograph_data = demograph_data.drop(columns=unused_demographic_columns)

In [10]:
# Inner-join the trimmed demographics table onto the merged frame by 'id'
merged_data = pd.merge(left=merged_data, right=demograph_data, how='inner', on='id')

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
# accuracy_score is used in Step 6 but was never imported, which raised NameError
from sklearn.metrics import accuracy_score

# Uses merged_data as built by the merge cells above.

# Step 1: Prepare the data
# NOTE(review): 'target_column' looks like a placeholder name -- confirm the
# real label column of merged_data before running.
# NOTE(review): X still contains the 'id' join key; if it is a pure identifier
# it probably should not be a model feature -- confirm.
X = merged_data.drop(columns=['target_column'])  # Features
y = merged_data['target_column']  # Target

# Step 2: Split the data into training and testing sets (80% train, 20% test).
# The split happens before imputation and PCA so that both are fit on the
# training rows only; fitting them on all rows lets test-set statistics leak
# into the features and inflates the reported accuracy.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Impute missing values with the medians learned from the training split
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Step 4: Dimensionality reduction (optional) - reduce to 10 components.
# random_state keeps the result reproducible if a randomized SVD solver is chosen.
pca = PCA(n_components=10, random_state=42)
X_train = pca.fit_transform(X_train_imputed)
X_test = pca.transform(X_test_imputed)

# Step 5: Train Random Forest with fewer estimators and use parallel processing
rf_model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)

# Step 6: Evaluate performance on the held-out test split
y_pred_rf = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_acc)