In [1]:
#!pip install streamlit
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Clustering
from sklearn.cluster import KMeans
# conda install -c conda-forge scikit-learn-extra
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from operator import itemgetter

# Principal Components Analysis
from scipy import stats
from sklearn.decomposition import PCA

# Classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import itertools

## Import Data

In [2]:
# Import Data
df = pd.read_csv('dgn_raw_data.csv')

# Add very small random number to Rating
df['target']=df['Rating'].apply(lambda x: x+random.random()/1000)

## Regressions for Each UID

In [3]:
# Unique IDs
ids = df.UID.unique()

# Run linear regressions for each UID
op = pd.DataFrame
intercept = []
coefficients=[]
UID = []
for i in ids:
    df_i = df[df.UID == i]              # Create dataframe for current user id
    X = df_i.drop(['UID','Rating','target'], axis=1)  # df input variables only
    y = df_i['target']                  # Series of target variable
    reg = LinearRegression().fit(X, y)  # Fit linear regression
    reg.score(X, y)                     # Score regression model
    unique_id=df_i['UID'].unique()      # Saves current user id
    const = reg.intercept_              # Save intercept of the regression model
    coef = reg.coef_                    # Coefficients of regression model
    UID.append(unique_id)               # Append current user id
    intercept.append(const)             # Append current intercept
    coefficients.append(coef)           # Append current regression coefficients

# Convert newly created lists into dataframes
intercep_new = pd.DataFrame(intercept)
coefficients_new = pd.DataFrame(coefficients)
UID_new = pd.DataFrame(UID)

# Get columns names
colNames = df.drop(['Rating', 'target',], axis=1).columns
colNames = colNames.insert(1, 'Const')
colNames

# Concatenate the new dataframes and add column names
op = pd.concat([UID_new,intercep_new, coefficients_new], axis=1)
op.columns = colNames

# Save only regression coefficients for clustering
scores = op.drop(['UID','Const'], axis=1)

### Hidden: Code for the regressions on variable groups A, B, C, D

In [None]:
# ### Linear Regression with A Variables

# # Unique IDs
# ids = df.UID.unique()

# # Run linear regressions for each UID
# op = pd.DataFrame
# intercept = []
# coefficients=[]
# UID = []
# for i in ids:
#     df_i = df[df.UID == i]              # Create dataframe for current user id
#     X = df_i.drop(['UID','B1', 'B2', 'B3', 'B4','C1', 'C2', 'C3', 'C4','D1', 'D2', 'D3', 'D4','Rating','target'], axis=1)  # df input variables only
#     y = df_i['target']                  # Series of target variable
#     reg = LinearRegression().fit(X, y)  # Fit linear regression
#     reg.score(X, y)                     # Score regression model
#     unique_id=df_i['UID'].unique()      # Saves current user id
#     const = reg.intercept_              # Save intercept of the regression model
#     coef = reg.coef_                    # Coefficients of regression model
#     UID.append(unique_id)               # Append current user id
#     intercept.append(const)             # Append current intercept
#     coefficients.append(coef)           # Append current regression coefficients

# # Convert newly created lists into dataframes
# intercep_new = pd.DataFrame(intercept)
# coefficients_new = pd.DataFrame(coefficients)
# UID_new = pd.DataFrame(UID)

# # Get columns names
# colNames = df.drop(['B1', 'B2', 'B3', 'B4','C1', 'C2', 'C3', 'C4','D1', 'D2', 'D3', 'D4','Rating', 'target'], axis=1).columns
# colNames = colNames.insert(1, 'Const')
# colNames

# # Concatenate the new dataframes and add column names
# op = pd.concat([UID_new,intercep_new, coefficients_new], axis=1)
# op.columns = colNames
# op_A = op.iloc[:,2:6]


# #******************************************************************************#
# ### Linear Regression with B Variables

# # Unique IDs
# ids = df.UID.unique()

# # Run linear regressions for each UID
# op = pd.DataFrame
# intercept = []
# coefficients=[]
# UID = []
# for i in ids:
#     df_i = df[df.UID == i]              # Create dataframe for current user id
#     X = df_i.drop(['UID','A1', 'A2', 'A3', 'A4','C1', 'C2', 'C3', 'C4','D1', 'D2', 'D3', 'D4','Rating','target'], axis=1)  # df input variables only
#     y = df_i['target']                  # Series of target variable
#     reg = LinearRegression().fit(X, y)  # Fit linear regression
#     reg.score(X, y)                     # Score regression model
#     unique_id=df_i['UID'].unique()      # Saves current user id
#     const = reg.intercept_              # Save intercept of the regression model
#     coef = reg.coef_                    # Coefficients of regression model
#     UID.append(unique_id)               # Append current user id
#     intercept.append(const)             # Append current intercept
#     coefficients.append(coef)           # Append current regression coefficients

# # Convert newly created lists into dataframes
# intercep_new = pd.DataFrame(intercept)
# coefficients_new = pd.DataFrame(coefficients)
# UID_new = pd.DataFrame(UID)

# # Get columns names
# colNames = df.drop(['A1', 'A2', 'A3', 'A4','C1', 'C2', 'C3', 'C4','D1', 'D2', 'D3', 'D4','Rating', 'target'], axis=1).columns
# colNames = colNames.insert(1, 'Const')
# colNames

# # Concatenate the new dataframes and add column names
# op = pd.concat([UID_new,intercep_new, coefficients_new], axis=1)
# op.columns = colNames
# op_B = op.iloc[:,2:6]


# #******************************************************************************#
# ### Linear Regression with C Variables

# # Unique IDs
# ids = df.UID.unique()

# # Run linear regressions for each UID
# op = pd.DataFrame
# intercept = []
# coefficients=[]
# UID = []
# for i in ids:
#     df_i = df[df.UID == i]              # Create dataframe for current user id
#     X = df_i.drop(['UID', 'A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4', 'D1', 'D2', 'D3', 'D4', 'Rating', 'target'], axis=1)  # df input variables only
#     y = df_i['target']                  # Series of target variable
#     reg = LinearRegression().fit(X, y)  # Fit linear regression
#     reg.score(X, y)                     # Score regression model
#     unique_id=df_i['UID'].unique()      # Saves current user id
#     const = reg.intercept_              # Save intercept of the regression model
#     coef = reg.coef_                    # Coefficients of regression model
#     UID.append(unique_id)               # Append current user id
#     intercept.append(const)             # Append current intercept
#     coefficients.append(coef)           # Append current regression coefficients

# # Convert newly created lists into dataframes
# intercep_new = pd.DataFrame(intercept)
# coefficients_new = pd.DataFrame(coefficients)
# UID_new = pd.DataFrame(UID)

# # Get columns names
# colNames = df.drop(['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4', 'D1', 'D2', 'D3', 'D4', 'Rating', 'target'], axis=1).columns
# colNames = colNames.insert(1, 'Const')
# colNames

# # Concatenate the new dataframes and add column names
# op = pd.concat([UID_new,intercep_new, coefficients_new], axis=1)
# op.columns = colNames
# op_C = op.iloc[:,2:6]



# #******************************************************************************#
# ### Linear Regression with D Variables

# # Unique IDs
# ids = df.UID.unique()

# # Run linear regressions for each UID
# op = pd.DataFrame
# intercept = []
# coefficients=[]
# UID = []
# for i in ids:
#     df_i = df[df.UID == i]              # Create dataframe for current user id
#     X = df_i.drop(['UID', 'A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4', 'C1', 'C2', 'C3', 'C4', 'Rating', 'target'], axis=1)  # df input variables only
#     y = df_i['target']                  # Series of target variable
#     reg = LinearRegression().fit(X, y)  # Fit linear regression
#     reg.score(X, y)                     # Score regression model
#     unique_id=df_i['UID'].unique()      # Saves current user id
#     const = reg.intercept_              # Save intercept of the regression model
#     coef = reg.coef_                    # Coefficients of regression model
#     UID.append(unique_id)               # Append current user id
#     intercept.append(const)             # Append current intercept
#     coefficients.append(coef)           # Append current regression coefficients

# # Convert newly created lists into dataframes
# intercep_new = pd.DataFrame(intercept)
# coefficients_new = pd.DataFrame(coefficients)
# UID_new = pd.DataFrame(UID)

# # Get columns names
# colNames = df.drop(['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4', 'C1', 'C2', 'C3', 'C4', 'Rating', 'target'], axis=1).columns
# colNames = colNames.insert(1, 'Const')
# colNames

# # Concatenate the new dataframes and add column names
# op = pd.concat([UID_new,intercep_new, coefficients_new], axis=1)
# op.columns = colNames
# op_D = op.iloc[:,2:6]


# #******************************************************************************#
# ### Replace 1's w/ Regression Coefficients in Original Data

# # Concatenate regression dataframes
# all_cfs = pd.concat([op_A, op_B, op_C, op_D], axis=1)

# # Replace 1's w/ regression coefficients by column
# cfs_cols = all_cfs.columns

# for col in cfs_cols:
#     for i in range(1,len(all_cfs)+1):
#         df.loc[df['UID'] == i,[col]] = df.loc[df['UID'] == i,[col]].replace(1,all_cfs.loc[i-1,col])

# df.head()

## PCA on Regression Coefficients

In [75]:
#******************************************************************************#
# Comparing covariance and correlation PCA.  We should do correlation PCA.

# Create PCA dataframe
df_fct = op.drop(['UID','Const'], axis=1)

# Standardize df_fct
# df_pca = stats.zscore(df_fct)  # Didn't need to worry about standardizing.
# Could build in a correlation/covariance PCA thing using ranges of the variables

# Create PCA object
pca = PCA(random_state=123)

# Get principal components
pca.fit(df_fct)

# Get scores
pca.transform(df_fct)

# Components needed
pcs_needed = len(np.where(pca.explained_variance_ >= 1)[0])

# Save scores for PCs w/ eignvalues >=1 as dataframe for clustering
scores = pd.DataFrame(pca.transform(df_fct))
scores = scores.iloc[:,0:pcs_needed]

# How many observations
n_samples = pca.components_.shape[0]

# Transpose the principal components
murph = pca.components_.T
# Center the data
murph -= np.mean(murph, axis=0)
# Compute the covariance matrix
cov_matrix = np.dot(murph.T, murph) / n_samples

for eigenvector in pca.components_:
    print(np.dot(eigenvector.T, np.dot(cov_matrix, eigenvector)))

n_samples = df_fct.shape[0]

pca = PCA(random_state=123)
X_transformed = pca.fit_transform(df_fct)

# This is the explained variance.  They are big numbers because this is Cov. PCA
X_centered = df_fct - np.mean(df_fct, axis=0)
cov_matrix = np.dot(X_centered.T, X_centered) / n_samples
eigenvalues = pca.explained_variance_
for eigenvalue, eigenvector in zip(eigenvalues, pca.components_):
    print(eigenvalue)

# This is the same thing as the code above but built into sklearn
pca.explained_variance_
# These are the eigenvalues of the covariance matrix
# They don't have values near 1-ish, so I need to understand why.


#******************************************************************************#
# Biplots for comparing correlation and covariance PCA for this data

# !pip install pca
from pca import pca

# Correlation PCA & biplot
model = pca(n_components=4)
results = model.fit_transform(df_pca)
fig, ax = model.biplot(n_feat=16, cmap=None, label=False, legend=False)

# Covariance PCA & biplot
model2 = pca(n_components=4)
results2 = model2.fit_transform(df_fct)
fig, ax = model2.biplot(n_feat=16, cmap=None, label=False, legend=False)

In [4]:
# # Create PCA dataframe
# df_fct = op.drop(['UID','Const'], axis=1)

# # Standardize df_fct - Need to standardize to use eigenvalues >= 1
# df_fct = stats.zscore(df_fct)

# # Create PCA object
# pca = PCA(random_state=123)

# # Get principal components & scores
# pca.fit_transform(df_fct)

# # Components needed
# pcs_needed = len(np.where(pca.explained_variance_ >= 1)[0])

# # Save scores for PCs w/ eignvalues >=1 as dataframe for clustering
# scores = pd.DataFrame(pca.transform(df_fct))
# scores = scores.iloc[:,0:pcs_needed]
# expl_var = round(pca.explained_variance_ratio_.cumsum()[pcs_needed-1]*100, 2)

# print("Principal Components Used: ", pcs_needed, sep='')
# print("Variance Explained: ", expl_var, "%", sep='')

Principal Components Used: 4
Variance Explained: 72.67%


## Cluster on Regression Coefficients

In [5]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('pandas_multiple.xlsx', engine='xlsxwriter')

# All maps for all cluster solutions
all_maps = []

# Column of last cluster solution
last_var = op.shape[1]

for i in range(2,7):
    
    sw = []
    
    # Create clustering objects
    cls1 = KMeans(n_clusters=i, random_state=0)
    cls2 = KMedoids(n_clusters=i, random_state=0)
    cls3 = AgglomerativeClustering(n_clusters=i,
                                   affinity='euclidean',
                                   linkage='ward')
        # Agglomerative clustering: if linkage=ward, affinity must be Euclidean
    cls_algs = [['kMeans', cls1],
                ['kMedoids', cls2],
                ['Hierarchical', cls3]]
    
    # Fit and score clustering solutions for i clusters w/ each algorithm
    for cls in cls_algs:
        
        # Fit the model to the factor analysis scores
        cls[1].fit(scores)
        
        # List of assigned clusters
        clusters = cls[1].fit_predict(scores)
        
        # Silhouette scores for each solution
        silhouette_avg = silhouette_score(scores,clusters)
        
        # Store solution info
        algorithm = cls[0]
        i_stats = [algorithm, i, silhouette_avg, clusters]
        sw.append(i_stats)

    # Reorder cluster lists by descending silhouette scores.
    # Clusters in first element should be assigned to training data.
    sw = sorted(sw, key=itemgetter(2), reverse=True)
    op[f'Optimal {sw[0][1]} cluster solution ({sw[0][0]})'] = sw[0][3]

    # This is the big for loop
    for i in range(18, last_var):
        df_cl = op.iloc[:,np.r_[2:18,i]]  # i is the current cluster solution

        #**********************************************************************#

        # Split data into 70% training, 30% validation
        train, valid = train_test_split(df_cl, test_size=0.30, random_state=123)

        # X is unlabeled training data, y is true training labels 
        X, y = train.iloc[:,0:-1], train.iloc[:,-1]

        X_valid, y_valid = valid.iloc[:,0:-1], valid.iloc[:,-1]

        #**********************************************************************#

        # Get variable importances

        clf1 = RandomForestClassifier(random_state=0)
        clf2 = GradientBoostingClassifier(random_state=0)

        classifiers = [['rf', clf1], ['gbt', clf2]]

        for classifier in classifiers:    
            # Fit classifier to training data
            classifier[1].fit(X,y)    

        # Create variable importance dataframe
        num_vars = list(range(1,len(clf1.feature_importances_)+1))
        importance = pd.DataFrame({'variable': num_vars,
                                   'rf': clf1.feature_importances_,
                                   'gbt': clf2.feature_importances_,})

        # Average variable importance of rf and gbt models
        importance['avg'] = (importance['rf']+importance['gbt'])/2

        # Put avg importances on a scale from 0 to 1 to make it easier to visualize
        importance['Relative Importance'] = np.interp(importance['avg'],
                                                      (importance['avg'].min(),
                                                       importance['avg'].max()),
                                                      (0, 1))

        # View top 10 variables when RF and GBT models are averaged
        top_10_avg = importance.sort_values(by='avg', ascending=False)[['avg','Relative Importance']].head(10)

        # Add variable rank column to dataframe
        importance_rank = num_vars
        importance = importance.sort_values(by='Relative Importance', ascending=False)
        importance['rank'] = importance_rank
        importance.reset_index(inplace=True)

        # Save index of top 5 variables (not the variable number!)
        top_5 = importance[importance['rank'] <= 5]['index']

        #**********************************************************************#
        # Convert data to binary, train classifiers, score validation, create maps

        # Convert X, X_valid, and df_cl predictors to all 1 and -1
        X = (X.mask(df > 0, other=1, inplace=False)
             .mask(df <= 0, other=-1, inplace=False))
        X_valid = (X_valid.mask(df > 0, other=1, inplace=False)
                   .mask(df <= 0, other=-1, inplace=False))
        all_data_masked = (df_cl.iloc[:,0:-1].mask(df > 0, other=1, inplace=False)
                           .mask(df <= 0, other=-1, inplace=False))

        map_collection = []

        # Retrain on the 2-5 most important variables
        for j in range(2,6):

            clf_scores = []

            clf1 = RandomForestClassifier(random_state=0)
            clf2 = GradientBoostingClassifier(random_state=0)
            clf3 = SVC(random_state=0)
            clf4 = KNeighborsClassifier()

            classifiers = [['rf', clf1], ['gbt', clf2], ['svc', clf3], ['knn', clf4]]

            for classifier in classifiers:

                # Fit classifier to training data
                classifier[1].fit(X.iloc[:,np.r_[top_5[0:j]]],y)

                # Store classifier-specific results [algorithm object, classifier name, scores]
                results = [classifier[1],
                           classifier[0],
                           classifier[1].score(X_valid.iloc[:,np.r_[top_5[0:j]]],y_valid)]

                # Overall classifier results
                clf_scores.append(results)

            # Sort classifier accuracy in descending order
            clf_scores = sorted(clf_scores, key=itemgetter(2), reverse=True)
            # clf_scores[0][0] is the best model
            
            # Fit the best model on all data
            best_model = clf_scores[0][0].fit(all_data_masked.iloc[:,np.r_[top_5[0:j]]], df_cl.iloc[:,-1])

            # Score validation data, get predictions
#             X_valid_sub = X_valid.iloc[:,np.r_[top_5[0:j]]]
#             score = clf_scores[0][0].score(X_valid_sub, y_valid)
#             preds = clf_scores[0][0].predict(X_valid_sub)

            # Create mappings
            
            # Creates grid of dimension j
            grid = pd.DataFrame(list(itertools.product([-1,1], repeat=j)))

            # This is the best model predicting the grid
            preds = best_model.predict(grid)

            # Add to grid dataframe
            grid['preds'] = preds

            # Change grid to mapping to fit into the rest of the code
            mapping = grid

            # Save current mapping to map collection for this cluster solution
            map_collection.append(mapping)

            # Write each dataframe to a different worksheet.
            mapping.to_excel(writer, sheet_name=f"{df_cl.columns[-1][8:17]}s, {j} vars")

        all_maps.append(map_collection)

writer.save()

In [62]:
clf_scores[0][0].fit(murph, df_cl.iloc[:,-1])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [56]:
murph = (df_cl.iloc[:,0:-1].mask(df > 0, other=1, inplace=False)
         .mask(df <= 0, other=-1, inplace=False))

In [60]:
df_cl.iloc[:,-1]

0     1
1     0
2     1
3     1
4     1
     ..
95    1
96    1
97    1
98    1
99    1
Name: Optimal 2 cluster solution (kMeans), Length: 100, dtype: int32

In [45]:
# Creates grid of dimension j
grid = pd.DataFrame(list(itertools.product([-1,1], repeat=j)))

# This is the best model predicting the grid
preds = clf_scores[0][0].predict(grid)

# Add to grid dataframe
grid['preds'] = preds

# Change grid to mapping to fit into the rest of the code
mapping = grid

In [146]:
# I need to create a new dataset for each clustering solution
# I could create a separate dataframe of clustering solutions, each with the original variables and 1 clustering solution
# Then I split the first dataset
# Then I run it through the classifier loop
# I get the variable importance
# Save the top 5 variables
# Train the classifier on the top 2 variables

# 2 clusters, 2 variables
# Every 1,1 combination needs to be classified the same
# Every 1,-1 combination needs to be classified the same

# Use a NumPy meshgrid to map out all possible 1,-1 combinations
# Should end up with 25 different maps (5 cluster solutions with 5 variables)


In [None]:
# This is for making the letter-based regression dynamic

# Create a list of unique first letters of variables
X = df.drop(['UID','Rating','target'], axis=1)
y = df["target"]

var_letters = []

for i in X.columns:
    var_letters.append(i[0:1])  # Append first character

var_letters = list(np.unique(var_letters))  # List of unique variable letters



<b>Scenario Analysis</b>

In [46]:
df.head()

Unnamed: 0,UID,A1,A2,A3,A4,B1,B2,B3,B4,C1,C2,C3,C4,D1,D2,D3,D4,Rating,target
0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,100,100.000507
1,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,100,100.000668
2,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0.000386
3,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,6.7e-05
4,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0.000382


In [19]:
# np.where(condition, value if condition is true, value if condition is false)

# create a list of the column categories

cat_A = ['A1', 'A2', 'A3', 'A4']
cat_B = ['B1', 'B2', 'B3', 'B4']
cat_C = ['C1', 'C2', 'C3', 'C4']
cat_D = ['D1', 'D2', 'D3', 'D4']

# create a list of our conditions
cat_A_conditions = [
    (df['A1'] == 1),
    (df['A2'] == 1),
    (df['A3'] == 1),
    (df['A4'] == 1),
    (df['A1']==0) & (df['A2']==0) & (df['A3']==0) & (df['A4']==0),
    ]

cat_B_conditions = [    
    (df['B1'] == 1),
    (df['B2'] == 1),
    (df['B3'] == 1),
    (df['B4'] == 1),
    (df['B1']==0) & (df['B2']==0) & (df['B3']==0) & (df['B4']==0),
    ]    

cat_C_conditions = [    
    (df['C1'] == 1),
    (df['C2'] == 1),
    (df['C3'] == 1),
    (df['C4'] == 1),
    (df['C1']==0) & (df['C2']==0) & (df['C3']==0) & (df['C4']==0),
    ]

cat_D_conditions = [    
    (df['D1'] == 1),
    (df['D2'] == 1),
    (df['D3'] == 1),
    (df['D4'] == 1),  
    (df['D1']==0) & (df['D2']==0) & (df['D3']==0) & (df['D4']==0),
    ]

# create a list of the values we want to assign for each condition
cat_A_values = ['A1', 'A2', 'A3', 'A4', 'A0']
cat_B_values = ['B1', 'B2', 'B3', 'B4','B0']
cat_C_values = ['C1', 'C2', 'C3', 'C4','C0']
cat_D_values = ['D1', 'D2', 'D3', 'D4','D0']
    
df['cat_A_scenario'] = np.select(cat_A_conditions, cat_A_values)
df['cat_B_scenario'] = np.select(cat_B_conditions, cat_B_values)
df['cat_C_scenario'] = np.select(cat_C_conditions, cat_C_values)
df['cat_D_scenario'] = np.select(cat_D_conditions, cat_D_values)

<b> Example of Scenario where Variable A1 has a 1 </b>

In [20]:
cat_A = df['cat_A_scenario'].unique()
# list of independant variables for regression
fields = df.columns[1:17]
dep_var = df['noise']

In [21]:
df_A1 = df[df['cat_A_scenario']=='A1']
df_A1

Unnamed: 0,UID,A1,A2,A3,A4,B1,B2,B3,B4,C1,...,D1,D2,D3,D4,Rating,noise,cat_A_scenario,cat_B_scenario,cat_C_scenario,cat_D_scenario
0,1,1,0,0,0,0,0,0,1,1,...,0,0,0,0,100,100.000893,A1,B4,C1,D0
1,1,1,0,0,0,0,0,1,0,0,...,1,0,0,0,100,100.000783,A1,B3,C0,D1
13,1,1,0,0,0,0,0,0,0,1,...,0,0,0,1,100,100.000825,A1,B0,C1,D4
16,1,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0.000479,A1,B4,C4,D3
17,1,1,0,0,0,1,0,0,0,0,...,0,0,0,1,100,100.000252,A1,B1,C3,D4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2383,100,1,0,0,0,0,0,0,1,0,...,0,0,0,1,5,5.000552,A1,B4,C0,D4
2385,100,1,0,0,0,0,0,1,0,0,...,0,0,1,0,5,5.000693,A1,B3,C2,D3
2393,100,1,0,0,0,0,0,0,1,0,...,0,1,0,0,5,5.000722,A1,B4,C4,D2
2398,100,1,0,0,0,1,0,0,0,0,...,0,0,0,0,5,5.000264,A1,B1,C4,D0


In [24]:
X =df_A1[['B1', 'B2', 'B3', 'B4','C1', 'C2', 'C3', 'C4','D1', 'D2', 'D3', 'D4']]
y= df_A1['noise']
reg = LinearRegression().fit(X, y)
reg.score(X, y)
const = reg.intercept_
coef = reg.coef_
print(const)
print(coef)

33.91035518762444
[ -3.84235504 -13.45891789  -5.98115784  -1.00151876  -5.2084735
  -0.95495595  -3.15645822  -7.2888877   -2.32297686  -3.7339574
  -5.08132817   1.23153583]


In [23]:
# scratch pad, not finished
for i in cat_A:
    df_i = df[df.cat_A_scenario == i]
    X =df_i[df_i.columns[1:17]]
    y= df_i['noise']
    reg = LinearRegression().fit(X, y)
    reg.score(X, y)
    const = reg.intercept_
    coef = reg.coef_
    intercept.append(const)
    coefficients.append(coef)
#print(UID)    
intercep_new = pd.DataFrame(intercept)
coefficients_new = pd.DataFrame(coefficients)