# SIMULATING DATA BASED ON STATISTICAL DISTRIBUTIONS / PROPERTIES

In [None]:
import numpy as np
import pandas as pd

In [None]:


# Load the adult income dataset (the raw file has no header row)
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None)

# Define column names
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income']

# Assign column names to dataframe.
# (The original re-selected exactly these columns afterwards, which was a no-op.)
df.columns = columns
cols = list(columns)

# Calculate the means and standard deviations of the numerical columns
num_cols = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
means = df[num_cols].mean()
stds = df[num_cols].std()

# Simulation settings: total rows, share of the minority class, and RNG seed
SIM_TOTAL_ROWS = 20000
SIM_MINORITY_FRACTION = 0.05
SIM_SEED = 42

# Class sizes as integers so they can be used directly as sample counts
minority_class_size = int(round(SIM_MINORITY_FRACTION * SIM_TOTAL_ROWS))
majority_class_size = SIM_TOTAL_ROWS - minority_class_size


def simulate_class(source, numeric_columns, col_means, col_stds, n_rows, label, rng):
    """Simulate ``n_rows`` rows that mimic the marginal distributions of ``source``.

    Numeric columns are drawn from a normal distribution with the column's mean
    and standard deviation; every other column is drawn uniformly from the
    column's distinct values. A constant ``income_binary`` column holds ``label``.

    Parameters
    ----------
    source : pd.DataFrame
        Frame whose columns (and distinct categorical values) are imitated.
    numeric_columns : list of str
        Columns to simulate with a normal distribution.
    col_means, col_stds : pd.Series
        Per-column mean and standard deviation, indexed by column name.
    n_rows : int
        Number of rows to generate.
    label : int
        Value written to the ``income_binary`` column.
    rng : numpy.random.Generator
        Random generator used for all draws.

    Returns
    -------
    pd.DataFrame
        The simulated rows, with the columns of ``source`` plus ``income_binary``.
    """
    simulated = {}
    for col in source.columns:
        if col in numeric_columns:
            simulated[col] = rng.normal(col_means[col], col_stds[col], n_rows)
        else:
            simulated[col] = rng.choice(source[col].unique(), n_rows)
    simulated['income_binary'] = label
    return pd.DataFrame(simulated)


# One seeded generator so the simulated dataset is reproducible on re-run
rng = np.random.default_rng(SIM_SEED)

# NOTE: both classes are drawn from the same overall means/stds and category
# sets, so the simulated features carry no information about 'income_binary'.
minority_data = simulate_class(df, num_cols, means, stds, minority_class_size, 1, rng)
majority_data = simulate_class(df, num_cols, means, stds, majority_class_size, 0, rng)

# Concatenate minority and majority data to get the final dataset
sim_data = pd.concat([minority_data, majority_data], ignore_index=True)

# Shuffle the rows of the dataset
sim_data = sim_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the simulated dataset as a CSV file
sim_data.to_csv('simulated_data.csv', index=False)


In [None]:
df = sim_data

In [None]:
from sklearn.model_selection import train_test_split

# Subset the data to include only the columns we are interested in
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income', 'income_binary']

# Build the modelling frame from `sim_data` as an independent copy without the
# 'income' column. Deriving it from `sim_data` (instead of mutating `df` in
# place) keeps this cell re-runnable and avoids writing into a slice.
df = sim_data[cols].drop(columns='income').copy()

# Check for missing values
print(df.isna().sum())

# Convert the 'income_binary' column to integer type
df['income_binary'] = df['income_binary'].astype(int)





In [None]:
data = df

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Splitting the data into X and y
X = df.drop('income_binary', axis=1)
y = df['income_binary']

# Applying Random Oversampling to the minority class.
# A fixed random_state makes the duplicated rows reproducible across runs.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_over, y_over = oversample.fit_resample(X, y)

# Counting the values of both classes after oversampling
y_over.value_counts()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.base import clone

# Splitting the data into X and y
X = df.drop('income_binary', axis=1)
y = df['income_binary']

# Splitting the data into train and test sets.
# Stratify so the rare positive class keeps the same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']),
        ('cat', categorical_transformer, ['workclass', 'education', 'marital.status', 'occupation',
                                          'relationship', 'race', 'sex', 'native.country'])])

# Defining the pipelines for each algorithm.
# Each pipeline gets its own clone of the preprocessor: Pipeline.fit fits its
# steps in place, so a shared instance would be refitted by every pipeline.
lr_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('clf', LogisticRegression(random_state=42))
])

rf_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('clf', RandomForestClassifier(random_state=42))
])

svm_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('clf', SVC(random_state=42))
])

# Fitting the pipelines
lr_pipeline.fit(X_train, y_train)
rf_pipeline.fit(X_train, y_train)
svm_pipeline.fit(X_train, y_train)

# Making predictions
y_pred_lr = lr_pipeline.predict(X_test)
y_pred_rf = rf_pipeline.predict(X_test)
y_pred_svm = svm_pipeline.predict(X_test)


def positive_class_scores(pipeline, features):
    """Return a continuous score for the positive class (label 1).

    Uses ``predict_proba`` when the fitted pipeline exposes it and falls back
    to ``decision_function`` otherwise (e.g. SVC without probability estimates).
    """
    if hasattr(pipeline, 'predict_proba'):
        return pipeline.predict_proba(features)[:, 1]
    return pipeline.decision_function(features)


def report_metrics(header, pipeline, y_pred):
    """Print accuracy, precision, recall, F1 and ROC AUC on the test split.

    ROC AUC is computed from continuous scores rather than hard labels, and
    ``zero_division=0`` keeps precision/recall/F1 defined when a model predicts
    no positives at all.
    """
    print(header)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, zero_division=0))
    print("Recall:", recall_score(y_test, y_pred, zero_division=0))
    print("F1 Score:", f1_score(y_test, y_pred, zero_division=0))
    print("ROC AUC Score:", roc_auc_score(y_test, positive_class_scores(pipeline, X_test)))


# Evaluating the models
report_metrics("Logistic Regression:", lr_pipeline, y_pred_lr)
report_metrics("\nRandom Forest:", rf_pipeline, y_pred_rf)
report_metrics("\nSupport Vector Machine:", svm_pipeline, y_pred_svm)


In [None]:
## Grouping data by feature: distinct BeneID and ClaimID counts per Provider
# NOTE(review): `cms_patient_data` is not defined in any cell visible in this
# part of the notebook — confirm it is loaded earlier, otherwise this cell
# raises NameError on a fresh kernel.

cms_patient_data_cnt = cms_patient_data[['BeneID', 'ClaimID']].groupby(cms_patient_data['Provider']).nunique().reset_index()
# Rename the unique-count columns so they are not mistaken for the raw IDs
cms_patient_data_cnt.rename(columns={'BeneID':'BeneID_count','ClaimID':'ClaimID_count'},inplace=True)

In [None]:
import pandas as pd
import io
from cryptography.fernet import Fernet

# Small example table used to demonstrate cell-by-cell encryption
data = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'salary': [50000, 60000, 70000]
})

# Fresh symmetric key and the cipher built from it
key = Fernet.generate_key()
cipher_suite = Fernet(key)

# Encrypt every cell: each value is stringified, encoded to bytes and
# turned into its own Fernet token
encrypted_data = {
    column_name: [
        cipher_suite.encrypt(text.encode())
        for text in data[column_name].astype(str).tolist()
    ]
    for column_name in data.columns
}
encrypted_data_df = pd.DataFrame(encrypted_data)

# Decrypt every token back to text (all columns come back as strings)
decrypted_data = {
    column_name: [
        cipher_suite.decrypt(token).decode('utf-8')
        for token in encrypted_data_df[column_name].tolist()
    ]
    for column_name in encrypted_data_df.columns
}
decrypted_data_df = pd.DataFrame(decrypted_data)

print(decrypted_data_df)


In [None]:
import pandas as pd
import io
from cryptography.fernet import Fernet

# Generate a key
key = Fernet.generate_key()

# Create a Fernet object with the key
cipher_suite = Fernet(key)

# Define a function to encrypt a DataFrame
def encrypt_dataframe(df, key):
    """Encrypt a whole DataFrame into a single Fernet token.

    The frame is serialized to CSV (without the index), the CSV text is
    encrypted with ``key``, and the resulting token is parsed back through
    ``pd.read_csv``. Because the token is a single line, the returned frame
    has no rows and carries the token as its only column label.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to encrypt.
    key : bytes
        Fernet key used for encryption. The original implementation ignored
        this argument and used the module-level ``cipher_suite`` instead.

    Returns
    -------
    pd.DataFrame
        Zero-row frame whose single column name is the encrypted token.
    """
    # Convert the DataFrame to a CSV string
    csv_string = df.to_csv(index=False)

    # Encrypt the CSV string with the key supplied by the caller
    encrypted_csv_string = Fernet(key).encrypt(csv_string.encode())

    # Wrap the token in a DataFrame (it becomes the header line)
    encrypted_df = pd.read_csv(io.StringIO(encrypted_csv_string.decode()))

    return encrypted_df

# Define a function to decrypt a DataFrame
def decrypt_dataframe(df, key):
    """Decrypt a frame produced by ``encrypt_dataframe`` back into the original data.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose CSV form is the Fernet token (token stored as the header).
    key : bytes
        Fernet key used for decryption. The original implementation ignored
        this argument and used the module-level ``cipher_suite`` instead.

    Returns
    -------
    pd.DataFrame
        The decrypted CSV content parsed with ``pd.read_csv``.

    Raises
    ------
    cryptography.fernet.InvalidToken
        Raised by Fernet when the token is invalid for ``key``.
    """
    # Serialize back to text; strip the trailing newline to_csv appends so
    # only the token itself is handed to the cipher
    csv_string = df.to_csv(index=False).strip()

    # Decrypt the token with the key supplied by the caller
    decrypted_csv_string = Fernet(key).decrypt(csv_string.encode())

    # Convert the decrypted CSV string back to a DataFrame
    decrypted_df = pd.read_csv(io.StringIO(decrypted_csv_string.decode()))

    return decrypted_df

# Example usage
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

# Encrypt the DataFrame
encrypted_df = encrypt_dataframe(df, key)
print(encrypted_df)

# Decrypt the DataFrame
decrypted_df = decrypt_dataframe(encrypted_df, key)
print(decrypted_df)


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
#import matplotlib.pyplot as plt  
from pandas import DataFrame, Series
import datetime as DT
import json
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, classification_report,confusion_matrix



import pickle

import time


In [None]:
os.chdir('C:/Users/p_adi/Downloads')

In [None]:
data1 = pd.read_csv('relpase_data.csv')
data1.head()

In [None]:
def extract_numeric_part(df_col):
    # regular expression to extract numeric part
    pattern = re.compile(r'\s*(\d+)\s*')

    # iterate over the column and extract numeric part
    numeric_part = []
    for val in df_col:
        if isinstance(val, str):
            match = pattern.search(val)
            if match:
                numeric_part.append(int(match.group(1)))
            else:
                numeric_part.append(None)
        elif isinstance(val, int):
            numeric_part.append(val)
        else:
            numeric_part.append(None)

    # return the new column as a pandas Series
    return pd.Series(numeric_part)

data1['weight'] = extract_numeric_part(data1['weight'])

In [None]:
data1['label'].value_counts()

In [None]:
data1['care_level'].value_counts()

In [None]:
data1['multiple_occurences'].value_counts()


## What is the meaning of different LOS?

In [None]:
cat_var_counts = data1['multiple_occurences'].value_counts()
cat_var_percentages = cat_var_counts / len(data1) * 100

# Create a horizontal bar chart with counts and percentages
fig, ax = plt.subplots()
ax.barh(cat_var_counts.index, cat_var_counts.values)
ax.set_xlabel('Count')
ax2 = ax.twiny()
ax2.barh(cat_var_percentages.index, cat_var_percentages.values, alpha=0.4, color='g')
ax2.set_xlabel('Percentage')
plt.show()

In [None]:
data1.isna().sum()

In [None]:
# Fill missing values with "" in non-numeric columns only. Filling numeric
# columns (e.g. the parsed 'weight') with "" would turn them into object
# columns and break later numeric steps such as np.log and scaling.
non_numeric_columns = data1.select_dtypes(exclude='number').columns
data1 = data1.copy()
if len(non_numeric_columns):
    data1[non_numeric_columns] = data1[non_numeric_columns].fillna("")

In [None]:
data1.isna().sum()

In [None]:
data1.columns

In [None]:
data1.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# create a figure and axes objects
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8, 10))

# plot a boxplot of the data on the first axis
sns.boxplot(data=data1, x='weight', ax=axs[0])
axs[0].set_title('Box plot of weight')

# plot a histogram of the data on the second axis
sns.histplot(data=data1, x='weight', bins=50, ax=axs[1])
axs[1].set_title('Histogram of weight')

# adjust the layout and show the plot
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# create a figure and axes objects
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8, 10))

# plot a boxplot of the data on the first axis
sns.boxplot(data=data1, x='weight', ax=axs[0])
axs[0].set_title('Box plot of weight')

# plot a histogram of the data on the second axis
sns.histplot(data=data1, x='weight', bins=range(1000, 20001, 1000), ax=axs[1])
axs[1].set_title('Histogram of weight')

# adjust the layout and show the plot
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# create a figure and axes objects
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8, 10))

# plot a boxplot of the data on the first axis
sns.boxplot(data=data1, x='weight', ax=axs[0])
axs[0].set_title('Box plot of weight')

# The logarithm is only defined for positive numbers: coerce to numeric and
# keep strictly positive weights so no -inf/NaN values reach the plot.
weight_numeric = pd.to_numeric(data1['weight'], errors='coerce')
log_weight = np.log(weight_numeric[weight_numeric > 0])

# histplot(stat='density', kde=True) is the replacement for the deprecated distplot
sns.histplot(log_weight, bins=np.arange(0, 10), stat='density', kde=True, ax=axs[1])
axs[1].set_title('Distribution plot of log(weight)')
axs[1].set_xlabel('log(weight)')

# adjust the layout and show the plot
fig.tight_layout()
plt.show()



In [None]:
data1['target'] = data1['multiple_occurences'].apply(lambda x : 1 if x == 'Yes' else 0)

In [None]:
data1['target'].value_counts()

In [None]:
dtype_conv_features= ['cpt_code', 'rev_code', 'diagcode', 'care_level', 'label']

ohe_features = ['cpt_code', 'rev_code', 'diagcode', 'care_level', 'label']

le_features = ['cpt_code', 'rev_code', 'diagcode', 'care_level', 'label']

scaling_features = ['weight', 'height', 'location_los']

percent_value = 85

In [None]:
def set_str(df,conv_features=[]):
    """Cast the listed columns of ``df`` to ``str`` in place and return ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to modify; the same object is returned.
    conv_features : list of str
        Names of the columns to convert. Nothing is changed when empty.
    """
    for column in conv_features:
        as_text = df[column].astype(str)
        df[column] = as_text
    return df

In [None]:
def set_engineered_features(df,engineered_features):
    """Apply each feature-engineering callable to ``df`` in mapping order.

    Parameters
    ----------
    df : object
        Value passed to the first callable (a DataFrame in this notebook).
    engineered_features : dict
        Maps a feature name to a callable taking the current frame and
        returning the next one. The names themselves are not used.

    Returns
    -------
    object
        The result of chaining all callables; ``df`` unchanged when empty.
    """
    current = df
    for transform in engineered_features.values():
        current = transform(current)
    return current

In [None]:
def feature_scaler(in_df, feature=[]):
    """Standardize the given columns of ``in_df`` in place with StandardScaler.

    Parameters
    ----------
    in_df : pd.DataFrame
        Frame to modify; the same object is returned.
    feature : list of str
        Columns that are fitted and transformed together by one scaler.

    Returns
    -------
    pd.DataFrame
        ``in_df`` with the selected columns replaced by their scaled values.
    """
    raw_values = in_df[feature].values
    # fit() returns the scaler itself, so fitting and transforming can be chained
    in_df[feature] = StandardScaler().fit(raw_values).transform(raw_values)
    print('#'*10, f'scaled: {feature}', '#'*10)
    return in_df

In [None]:
def calculate_percent_row_class(df, feature, percent_value):
    """Return the most frequent categories that together cover ``percent_value`` % of rows.

    Categories are ranked by frequency and their cumulative row percentage is
    computed. Categories whose cumulative share is at or below the threshold
    are kept; if they stay strictly below it (or none qualifies), the next
    category is added so the threshold is reached or crossed.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame.
    feature : str
        Column whose categories are ranked.
    percent_value : float
        Cumulative percentage (0-100) of rows to cover.

    Returns
    -------
    list
        Category labels, most frequent first. Empty when the column has no
        countable values.
    """
    percentage_row_count = df[feature].value_counts(normalize=True).mul(100)
    cumsum_percentage_count = percentage_row_count.cumsum()
    d = cumsum_percentage_count[cumsum_percentage_count <= percent_value]

    # Positions are addressed with .iloc: the Series is indexed by category
    # label, so plain [] with an integer is a label lookup for integer
    # categories and a deprecated positional fallback for the others.
    if len(d) != 0 and d.iloc[-1] < percent_value:
        length = len(d)
        if length < len(cumsum_percentage_count):
            boundary = cumsum_percentage_count.iloc[length]
            m = cumsum_percentage_count[cumsum_percentage_count == boundary]
            category_list = list(d.index) + list(m.index)
        else:
            # every category is already included; nothing left to add
            category_list = list(d.index)
    elif len(d) == 0:
        if cumsum_percentage_count.empty:
            category_list = []
        else:
            first = cumsum_percentage_count.iloc[0]
            m = cumsum_percentage_count[cumsum_percentage_count == first]
            category_list = list(m.index)
    else:
        category_list = list(d.index)
    return category_list

In [None]:
def label_encoder(in_df,col_name_with_threshold=[], percent_value= percent_value):
    """Collapse rare categories into 'Other' and label-encode columns in place.

    For every listed column, the categories returned by
    ``calculate_percent_row_class`` are kept, all remaining values are replaced
    by the string 'Other', and the column is then encoded with a fresh
    ``LabelEncoder``.

    Parameters
    ----------
    in_df : pd.DataFrame
        Frame to modify; the same object is returned.
    col_name_with_threshold : list of str
        Columns to process.
    percent_value : float
        Cumulative row percentage passed to ``calculate_percent_row_class``.
        The default is the module-level ``percent_value`` at definition time.

    Returns
    -------
    tuple
        ``(label_map, in_df)`` where ``label_map[column]`` maps each original
        class label to its integer code.
    """
    label_map = {}
    for col_name in col_name_with_threshold:
        # Keep the frequent categories, bucket everything else as 'Other'
        frequent_categories = calculate_percent_row_class(in_df, col_name, percent_value)
        in_df[col_name] = np.where(
            in_df[col_name].isin(frequent_categories), in_df[col_name], 'Other')

        encoder = LabelEncoder()
        in_df[col_name] = encoder.fit_transform(in_df[col_name])

        # classes_[i] is the label encoded as i, so zipping it with 0..n-1
        # gives the mapping without an inverse_transform round trip
        label_codes = np.arange(0, len(encoder.classes_), 1)
        label_map[col_name] = dict(zip(encoder.classes_, label_codes))

        print('#'*10, f'label encoded: {col_name}', '#'*10)

    return label_map, in_df

In [None]:
def preprocessing_deniedApproved_model(df, le_features, ohe_features, scaling_features, dtype_conv_features):
    """Run the preprocessing chain: string cast, label encoding, one-hot encoding.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; ``set_str`` and ``label_encoder`` modify it in place.
    le_features : list of str
        Columns handed to ``label_encoder``.
    ohe_features : list of str
        Columns expanded with ``pd.get_dummies`` (after label encoding).
    scaling_features : list of str
        Accepted for interface compatibility; no scaling is applied here.
    dtype_conv_features : list of str
        Columns cast to ``str`` before encoding.

    Returns
    -------
    tuple
        ``(out_df, Label_map)``: the one-hot encoded frame and the per-column
        label-to-code mapping produced by ``label_encoder``.
    """
    as_strings = set_str(df, dtype_conv_features)
    Label_map, encoded = label_encoder(as_strings, le_features)

    # One-hot encode the (already label-encoded) categorical columns
    out_df = pd.get_dummies(encoded, columns = ohe_features)
    print(" shape:", out_df.shape)
    return out_df,Label_map
    
df_combined,Label_map = preprocessing_deniedApproved_model(data1, le_features, ohe_features,  scaling_features, dtype_conv_features)
df_combined.shape

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Distribution of height. histplot(stat='density', kde=True) replaces the
# deprecated sns.distplot.
fig, ax = plt.subplots(figsize=(14, 12))
sns.histplot(df_combined['height'], stat='density', kde=True, ax=ax)
ax.set_title('Distribution of height')

# Show the plot
plt.show()

In [None]:
# Box plot of height. The series is passed as x= explicitly because the
# meaning of the first positional argument differs between seaborn versions.
fig, ax = plt.subplots(figsize=(14, 12))
sns.boxplot(x=df_combined['height'], ax=ax)
ax.set_title('Box plot of height')
plt.show()

In [None]:
# Distribution of location_los, drawn on the axes created here.
# histplot(stat='density', kde=True) replaces the deprecated sns.distplot.
fig, ax = plt.subplots(figsize=(14, 12))
sns.histplot(df_combined['location_los'], stat='density', kde=True, ax=ax)
ax.set_title('Distribution of location_los')
plt.show()

In [None]:
# Box plot of location_los, drawn on the axes created here; x= is explicit
# because the first positional argument differs between seaborn versions.
fig, ax = plt.subplots(figsize=(14, 12))
sns.boxplot(x=df_combined['location_los'], ax=ax)
ax.set_title('Box plot of location_los')
plt.show()

In [None]:
import scipy.stats as stats

# Create a DataFrame with numeric data


# Create a Q-Q plot of the data
stats.probplot(df_combined['location_los'], dist='norm', plot=plt)

# Show the plot
plt.show()

In [None]:
import scipy.stats as stats

# Create a DataFrame with numeric data


# Create a Q-Q plot of the data
stats.probplot(df_combined['weight'], dist='norm', plot=plt)

# Show the plot
plt.show()

In [None]:
import scipy.stats as stats

# Create a DataFrame with numeric data


# Create a Q-Q plot of the data
stats.probplot(df_combined['height'], dist='norm', plot=plt)

# Show the plot
plt.show()

In [None]:
data1.drop(['patientmasterkey','multiple_occurences'], axis=1, inplace = True)

In [None]:
# import pandas as pd
# from sklearn.compose import ColumnTransformer
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV, train_test_split
# from sklearn.preprocessing import OneHotEncoder, StandardScaler


# # Define column transformers
# numeric_transformer = StandardScaler()
# categorical_transformer = OneHotEncoder(drop="first")

# # Encode categorical variables and scale numeric variables
# ct = ColumnTransformer(
#     transformers=[
#         ("num", numeric_transformer, ['weight', 'height', 'location_los']),
#         ("cat", categorical_transformer, ['cpt_code', 'rev_code', 'diagcode', 'care_level', 'label'])
#     ])

# # Transform the data
# X = data1.drop("target", axis=1)
# y = data1["target"]
# X = ct.fit_transform(X)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Train the Random Forest Classifier using Grid Search
# param_grid = {
#     "n_estimators": [100, 300],
#     "max_depth": [5, 10, 20],
#     "min_samples_split": [2, 5],
#     "min_samples_leaf": [1, 2, 4, 6],
#     "max_features": ["sqrt"]
# }
# rf = RandomForestClassifier(random_state=42)
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
# grid_search.fit(X_train, y_train)

# # Print the best parameters and the best score obtained
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

# # Evaluate the model on the testing data
# from sklearn.metrics import accuracy_score
# y_pred = grid_search.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)


In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler



from sklearn.pipeline import Pipeline

# Define column transformers.
# handle_unknown="ignore": once the encoder is fitted on training folds only,
# validation/test rows may contain categories it has not seen.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop="first", handle_unknown="ignore")

# Encode categorical variables and scale numeric variables
ct = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, ['weight', 'height', 'location_los']),
        ("cat", categorical_transformer, ['cpt_code', 'rev_code', 'diagcode', 'care_level', 'label'])
    ])

# Split the data into X and y
X = data1.drop("target", axis=1)
y = data1["target"]

# Split BEFORE fitting any transformer so test rows never influence the
# scaler statistics or the encoder's categories.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocessing lives inside the pipeline, so GridSearchCV refits it on the
# training part of every CV fold.
rf = RandomForestClassifier(random_state=42)
model = Pipeline([("preprocess", ct), ("rf", rf)])

# Train the Random Forest Classifier using Grid Search
# (parameter names carry the "rf__" prefix of the pipeline step)
param_grid = {
    "rf__n_estimators": [100, 300],
    "rf__max_depth": [5, 10, 20],
    "rf__min_samples_split": [2, 5],
    "rf__min_samples_leaf": [1, 2, 4, 6],
    "rf__max_features": ["sqrt"]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score obtained
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model on the testing data
from sklearn.metrics import accuracy_score
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
data2 = data1
data2.columns

In [None]:
# Define column transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop="first")

# Encode categorical variables and scale numeric variables
ct = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, ['weight', 'height']),
        ("cat", categorical_transformer, ['cpt_code',  'diagcode', 'care_level', 'label'])
    ])

In [None]:
# Split the data into X and y
X = data2.drop("target", axis=1)
y = data2["target"]

# Transform the data
# NOTE(review): `ct` is fitted on all rows here, before the train/test split
# below, so the scaler statistics and encoder categories also see the test
# rows. Consider splitting first and fitting `ct` on the training rows only.
X = ct.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Train the Random Forest Classifier using Grid Search
param_grid = {
    "n_estimators": [100, 300],
    "max_depth": [5, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4, 6],
    "max_features": ["sqrt"]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)



In [None]:
# Print the best parameters and the best score obtained
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model on the testing data
from sklearn.metrics import accuracy_score
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
from numpy.random import randn
from numpy import percentile

def outlier_treatment(datacolumn, cut_off_multiplier=3):
    """Compute IQR-based outlier fences for a numeric column.

    Parameters
    ----------
    datacolumn : array-like
        Numeric values.
    cut_off_multiplier : float, default 3
        Number of IQRs beyond the quartiles at which the fences are placed.
        The default 3 flags extreme outliers; 1.5 is the conventional choice
        for ordinary outliers.

    Returns
    -------
    tuple
        ``(iqr, lower, upper, q25, q75, q50)`` where ``lower`` and ``upper``
        are ``q25 - cut_off_multiplier * iqr`` and
        ``q75 + cut_off_multiplier * iqr``.
    """
    # one pass over the data for all three percentiles
    q25, q50, q75 = percentile(datacolumn, [25, 50, 75])
    iqr = q75 - q25

    # calculate the outlier cutoff
    cut_off = iqr * cut_off_multiplier
    lower, upper = q25 - cut_off, q75 + cut_off
    return iqr, lower, upper, q25, q75, q50

In [None]:
#Treat  for outliers - 
def checkOutlier(data, checkOutlierCol):
    """Print an outlier summary for one column and return the rows above the upper fence.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the column.
    checkOutlierCol : str
        Name of the column to inspect.

    Returns
    -------
    pd.Index
        Index labels of the rows whose value exceeds the upper fence returned
        by ``outlier_treatment``.
    """
    iqr, lower, upper, q25, q75, q50 = outlier_treatment(data[checkOutlierCol])

    # IQR, lower fence and upper fence, one line each
    for bound in (iqr, lower, upper):
        print(checkOutlierCol, " : ", bound)

    values = data[checkOutlierCol]
    rows_above = data.loc[values > upper]
    rows_below = data.loc[values < lower]

    print(checkOutlierCol, ": Number of rows present above upper cutoff : ", rows_above.shape)
    print(checkOutlierCol, ": Number of rows present below lower cutoff : ", rows_below.shape)
    print(checkOutlierCol , ': Percentiles: 25th=%.3f, 75th=%.3f, 50th=%.3f, IQR=%.3f' % (q25, q75, q50, iqr))
    return rows_above.index

In [None]:
index = checkOutlier(df_combined,'height')

In [None]:
index1 = checkOutlier(df_combined,'location_los')

In [None]:
# Ensure every encoded feature has an 'Other' code: when it is missing, give
# it the next free code and add a matching all-zero dummy column.
for feature_name, code_map in Label_map.items():
    if 'Other' in code_map.keys():
        continue
    other_code = max(code_map.values()) + 1
    code_map['Other'] = other_code
    dummy_column = feature_name + '_' + str(other_code)
    df_combined = df_combined.assign(**{dummy_column: 0})

In [None]:
df_combined['target'].value_counts()

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """Custom JSON encoder for NumPy data types.

    Conversions performed by ``default``:
    integers -> ``int``, floats -> ``float``, complex -> ``{'real', 'imag'}``
    dict, arrays -> nested lists, ``np.bool_`` -> ``bool``, ``np.void`` ->
    ``None``. Anything else is delegated to ``json.JSONEncoder`` (which raises
    ``TypeError``).

    The abstract scalar types (``np.integer``, ``np.floating``,
    ``np.complexfloating``) are used instead of enumerating concrete ones: the
    aliases ``np.float_`` and ``np.complex_`` were removed in NumPy 2.0 and
    raise AttributeError there.
    """
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)

        elif isinstance(obj, np.floating):
            return float(obj)

        elif isinstance(obj, np.complexfloating):
            return {'real': obj.real, 'imag': obj.imag}

        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()

        elif isinstance(obj, (np.bool_)):
            return bool(obj)

        elif isinstance(obj, (np.void)):
            return None

        return json.JSONEncoder.default(self, obj)

In [None]:
with open('Label_map.json', 'w') as file:
    json.dump(Label_map, file, indent=4, sort_keys=True,
              separators=(', ', ': '), ensure_ascii=False,
              cls=NumpyEncoder)

In [None]:
df_final_processed_data_no_duplicate = df_combined.drop_duplicates(keep='first')

In [None]:
ls = list(df_final_processed_data_no_duplicate.columns)

In [None]:
ls_remove =['patientmasterkey','cpt_code', 'rev_code', 'diagcode', 'care_level','label']

In [None]:
# Drop the unwanted names from `ls` in place. A filter is used instead of
# list.remove, which raises ValueError for any name that is no longer among
# the columns (e.g. columns replaced by their one-hot dummies).
ls[:] = [column for column in ls if column not in ls_remove]

In [None]:
testSize = 0.2
seed = 7
k = 5

In [None]:
# Separate the features (x) from the target column (y, kept as a one-column frame)
# NOTE(review): `df_sampled_data` is not defined in any cell visible in this
# part of the notebook — confirm where it is created, otherwise this cell
# raises NameError on a fresh kernel.
x = df_sampled_data.drop(['target'],axis=1)
y = pd.DataFrame(df_sampled_data['target'],columns=['target'],index=x.index)

In [None]:
import pandas as pd 
import numpy as np
import os
os.chdir('C:/Users/p_adi/Downloads')

In [None]:
data1 = pd.read_csv('data_ts.csv')
data1.head()

In [None]:
import datetime as dt
data1['Date'] = pd.to_datetime(data1['Departure_Date'])
df_filtered = data1[(data1['Date'].dt.year == 2022)]

In [None]:
df_filtered.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [None]:

# Plot the time series to visualize its trends and patterns
data1['Total_Available_Seats'].plot(figsize=(12,6))
plt.show()

In [None]:
# Keep only the columns used for the time-series analysis. `.copy()` makes
# `df` independent of `df_filtered`, so later column assignments neither
# trigger SettingWithCopyWarning nor depend on view/copy semantics.
df = df_filtered[['Date', 'MARKET', 'Total_Available_Seats', 'Cum_Customer_Segment_Count']].copy()

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
# Assign the interpolated column back: calling interpolate(inplace=True) on a
# column selection acts on a temporary object and is not guaranteed to update `df`.
df['Total_Available_Seats'] = df['Total_Available_Seats'].interpolate(method ='linear', limit_direction ='backward')

In [None]:
# # Convert the date column to a pandas datetime object
# df['Departure_Date'] = pd.to_datetime(df['Departure_Date'])

# # Set the date column as the index of the dataframe
# df.set_index('Departure_Date', inplace=True)

In [None]:
df

In [None]:
# df.reset_index(inplace = True)

In [None]:
# # Convert the date column to a pandas datetime object
# df['Date'] = pd.to_datetime(df['Date'])

# # Set the date column as the index of the dataframe
# df.set_index('Date', inplace=True)
# df.index = pd.to_datetime(df['Date'].index)
# df.index.freq = pd.infer_freq(df.index)
# df_resampled = df.resample('D').sum()



# convert the date column to a datetime object
df['Date'] = pd.to_datetime(df['Date'])

# set the date column as the index
df.set_index('Date', inplace=True)
# # create a DatetimeIndex with a daily frequency
# idx = pd.date_range(start=df.index.min(), end=df.index.max(), freq='D')

# # create a new dataframe with the missing dates filled in
# df_resampled = df.reindex(idx)

# create a PeriodIndex with a daily frequency.
# Built from `df`: `df_resampled` has no earlier active definition (the
# reindex above is commented out), so converting it raised NameError.
df_resampled = df.to_period('D')

# # Convert the index to a PeriodIndex
# df.index = pd.PeriodIndex(df['Departure_Datetime_Local'].index, freq='D')

# # Resample the data using a PeriodIndex with a valid frequency set
# df_resampled = df.resample('D').sum()


In [None]:
df.reset_index(inplace = True)

In [None]:
df

In [None]:
df.set_index('Date', inplace=True)
df = df[~df.index.duplicated(keep='first')]
new_index = pd.date_range(start=df.index.min(), end=df.index.max())
# # reindex the DataFrame with the new index, filling missing values with NaN
df = df.reindex(new_index)

In [None]:
df.isnull().sum()

In [None]:
# df = df.reindex(new_index, method='ffill')

In [None]:
# Drop rows with a missing seat count or market.
df = df.dropna(subset=['Total_Available_Seats', 'MARKET'])
# 'Date' was moved into the index by the preceding set_index cell, so it
# cannot be used in dropna(subset=...) (KeyError); filter on the index instead.
df = df[df.index.notna()]

In [None]:
df.dropna(inplace = True)

In [None]:
# Decompose the time series into its components using an additive model
result_add = seasonal_decompose(df['Cum_Customer_Segment_Count'], model='additive')
result_add.plot()
plt.show()

In [None]:





# The decomposition and smoothing models are univariate, so work on one
# numeric series instead of the whole frame (which also holds the text
# column 'MARKET').
TARGET_COLUMN = 'Cum_Customer_Segment_Count'
SEASONAL_PERIOD = 7  # assumes a weekly cycle in daily data — TODO confirm
FORECAST_STEPS = 30

# Regular daily frequency; gaps left by dropped rows are filled by
# time-based interpolation.
series = (
    df[TARGET_COLUMN]
    .astype(float)
    .asfreq('D')
    .interpolate(method='time')
)

# Decompose the time series into its components using a multiplicative model
# (multiplicative components require strictly positive values)
result_mult = seasonal_decompose(series, model='multiplicative', period=SEASONAL_PERIOD)
result_mult.plot()
plt.show()


# Fit an Exponential Smoothing model using an additive trend and additive seasonal component
model_add = ExponentialSmoothing(
    series, trend='add', seasonal='add', seasonal_periods=SEASONAL_PERIOD).fit()

# Fit an Exponential Smoothing model using a multiplicative trend and multiplicative seasonal component
model_mult = ExponentialSmoothing(
    series, trend='mul', seasonal='mul', seasonal_periods=SEASONAL_PERIOD).fit()

# Generate forecasted values for the next 30 days using the additive model
forecast_add = model_add.forecast(steps=FORECAST_STEPS)

# Generate forecasted values for the next 30 days using the multiplicative model
forecast_mult = model_mult.forecast(steps=FORECAST_STEPS)

# Plot the forecasted values against the actual values
plt.plot(series.index, series, label='Actual')
plt.plot(forecast_add.index, forecast_add, label='Additive Forecast')
plt.plot(forecast_mult.index, forecast_mult, label='Multiplicative Forecast')
plt.legend()
plt.show()

In [None]:
pip install --upgrade setuptools

In [None]:
pip install googleapis_common_protos

In [None]:
pip cache purge

In [None]:
pip install googleapis_common_protos

In [None]:
pip install faiss

In [2]:
import numpy as np
import pandas as pd
import faiss

In [64]:
# Read data from CSV file
import os
# NOTE(review): machine-specific absolute path; the relative CSV path read in the
# next cell is resolved against this working directory.
os.chdir('C:/Users/p_adi/Downloads')

In [65]:
# Load relapse_faiss.csv into a DataFrame (path is relative to the current working directory)
data = pd.read_csv('relapse_faiss.csv')

  data = pd.read_csv('relapse_faiss.csv')


In [66]:
# Preview the first rows to check how the columns were parsed
data.head()

Unnamed: 0,patientmasterkey,cpt_code,rev_code,diagcode,gender,dob,paymentmethod,ethnicity,comorbidity,care_level,admission_date,label,weight,height,location_los,program_los,levelofcare_los
0,1028_1093,"S9475,90837,INTEREST,H0018","0906,0912,INT,1002,0913,0126","F33.1,F33.1,F41.1,F12.90,,,,,,,",Female,1994-01-13,Insurance Tx and Labs,,,"IOP,Ancillary,PHP,Custom,RTC,Detox",2020-08-03 15:00:00,severe,215.0,67,48,23,19
1,1028_1138,H0018,100201280126,"F43.10,F43.10,F41.1,F10.20,F10.929,F51.4,F12.1...",Male,1966-01-19,Insurance Tx and Labs,Other,,"RTC,Detox",2020-09-05 11:00:00,less_severe,279.9,75,19,13,12
2,1028_1222,H0018,10020126,"F12.20,F12.20,F41.1,F40.10,M10.071,F17.210,,,,,",Male,1972-08-31,Insurance Tx and Labs,Other,,"RTC,Detox",2020-10-19 16:30:00,other,168.0,69,10,5,5
3,1028_1228,"S9475,H0018",091310020912,"F40.10,F40.10,F10.20,,,,,,,,",Male,1983-09-06,Insurance Tx and Labs,Other,,"PHP,RTC",2020-10-29 17:00:00,less_severe,174.4,71,16,8,8
4,1028_1241,"S0201,H0018",091301261002,"F10.239,F10.239,F17.208,,,,,,,,",Male,1975-10-24,Insurance Tx and Labs,Other,,"PHP,Detox,RTC",2020-11-06 17:00:00,other,151.0,73,16,10,8


In [67]:
# Columns that are passed to pd.get_dummies as the ones to one-hot encode
categorical_column_name = [
    'patientmasterkey',
    'cpt_code',
    'gender',
    'dob',
    'paymentmethod',
    'ethnicity',
    'comorbidity',
    'care_level',
    'label',
]

In [69]:
# One-hot encode the categorical columns into a NEW frame. Rebinding `data` here
# would replace the raw columns (e.g. 'patientmasterkey', 'cpt_code', 'label') with
# indicator columns, and the later cells that read those raw columns from `data`
# would then fail with a KeyError when the notebook is run top to bottom.
# NOTE(review): 'patientmasterkey' and 'dob' look high-cardinality - confirm they should be encoded at all.
data_encoded = pd.get_dummies(data, columns=categorical_column_name)

In [46]:
# Summarise column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97843 entries, 0 to 97842
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   patientmasterkey  97843 non-null  object
 1   cpt_code          83989 non-null  object
 2   rev_code          58812 non-null  object
 3   diagcode          97843 non-null  object
 4   gender            97711 non-null  object
 5   dob               97843 non-null  object
 6   paymentmethod     97636 non-null  object
 7   ethnicity         46745 non-null  object
 8   comorbidity       5794 non-null   object
 9   care_level        97843 non-null  object
 10  admission_date    97843 non-null  object
 11  label             97843 non-null  object
 12  weight            97843 non-null  object
 13  height            97843 non-null  int64 
 14  location_los      97843 non-null  int64 
 15  program_los       97843 non-null  int64 
 16  levelofcare_los   97843 non-null  int64 
dtypes: int64(4),

In [47]:
# Confirm the type of the loaded object
type(data)

pandas.core.frame.DataFrame

In [50]:
# Placeholder preprocessing step for a single patient record
def preprocess_data(p_keys, cpt_codes, gender, ethnicity, location_ids, c_level, target):
    """Bundle the fields of one record into a feature vector.

    This is a stand-in: no transformation or encoding is applied. The seven
    arguments are returned unchanged, in argument order, as a list.
    Replace the body with real preprocessing logic.
    """
    feature_vector = [
        p_keys,
        cpt_codes,
        gender,
        ethnicity,
        location_ids,
        c_level,
        target,
    ]
    return feature_vector

In [51]:



# Pull the columns used for the feature vectors out of the DataFrame as plain
# Python lists, one list per column (note: location_ids is read from 'location_los').
p_keys, cpt_codes, gender, ethnicity, location_ids, c_level, target = (
    data[column].tolist()
    for column in (
        'patientmasterkey',
        'cpt_code',
        'gender',
        'ethnicity',
        'location_los',
        'care_level',
        'label',
    )
)

# Pass each row's fields through preprocess_data and stack the resulting
# feature vectors into a single numpy array (one row per record).
data1 = np.array([
    preprocess_data(*fields)
    for fields in zip(p_keys, cpt_codes, gender, ethnicity, location_ids, c_level, target)
])



In [52]:
# Inspect the assembled feature array (the output shows a string dtype)
data1

array([['1028_1093', 'S9475,90837,INTEREST,H0018', 'Female', ..., '48',
        'IOP,Ancillary,PHP,Custom,RTC,Detox', 'severe'],
       ['1028_1138', 'H0018', 'Male', ..., '19', 'RTC,Detox',
        'less_severe'],
       ['1028_1222', 'H0018', 'Male', ..., '10', 'RTC,Detox', 'other'],
       ...,
       ['947_9729', 'nan', 'Female', ..., '16', 'Detox,RTC', 'other'],
       ['947_9737', '80305,S0201,H0035,J2315,96372', 'Male', ..., '18',
        'Ancillary,PHP,RTC,Detox', 'other'],
       ['947_9828', '96372,S0201,80305,J2315,H0015,87811,90837', 'Male',
        ..., '42', 'Ancillary,PHP,Detox,RTC,IOP,TBD,OP', 'other']],
      dtype='<U320')

In [None]:
# Echo the extracted lists as a tuple; each list holds one entry per DataFrame row, so this output is large
p_keys, cpt_codes, gender, ethnicity, location_ids, c_level, target

In [56]:
# Build Faiss index
# d is the number of columns of data1, i.e. the number of fields in each feature vector
d = data1.shape[1] # dimension of feature vectors

In [59]:
# Create an empty flat index over d-dimensional vectors; vectors are added in a later cell
index = faiss.IndexFlatL2(d)  # create L2 distance index


In [60]:
# Display the repr of the index object
index

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001F4AABEF2A0> >

In [61]:
# NOTE(review): data1 is a string array (its repr shows dtype '<U320'), so this call
# raises the ValueError recorded in the output; the features must be converted to a
# numeric representation before they can be added to the index.
index.add(data1)  # add data to the index

ValueError: could not convert string to float: 'S9475,90837,INTEREST,H0018'

In [58]:





# Query for similar patients
# NOTE(review): none of the query_* variables are defined in the visible cells; the
# recorded output of this cell is a NameError on query_p_keys. They must be assigned
# (one value per field, same order as preprocess_data's parameters) before this runs.
query_patient = preprocess_data(query_p_keys, query_cpt_codes, query_gender, query_ethnicity, query_location_ids, query_c_level, query_target)  # preprocess query patient data
query_patient = np.array([query_patient])  # wrap in a list so the array has a single row
_, similar_patient_indices = index.search(query_patient, k=5)  # search for 5 most similar patients; the first returned value is discarded

# Get similar patients' IDs from the original data
similar_patient_ids = data.iloc[similar_patient_indices[0]]['patientmasterkey'].tolist()  # positional lookup of the first query row's neighbours, reading the 'patientmasterkey' column

# Print similar patient IDs
print("Similar Patients: ", similar_patient_ids)


NameError: name 'query_p_keys' is not defined

In [None]:
# !pip install libomp-dev

In [None]:
# !pip install faiss-cpu --no-cache