# base code for extracting data from Snowflake

In [8]:
from snowflake.snowpark import Session
import snowflake.connector
import logging
import requests
import json
import configparser
import sys, os
import warnings
import base64
import glob
import pandas as pd  # Make sure to import pandas
import numpy as np


import numpy as np

# Environment setup: make the parent directory importable, silence all
# warnings, and wipe the console before anything is printed.
_parent_dir = os.path.realpath('..')
sys.path.append(_parent_dir)
warnings.filterwarnings("ignore")

# Windows shells clear with 'cls'; everything else uses 'clear'.
if os.name == 'nt':
    _clear_cmd = 'cls'
else:
    _clear_cmd = 'clear'
os.system(_clear_cmd)

# The working directory is the root that later relative paths are built from.
masterdir = os.getcwd()
print(masterdir)

# Configuration
# Connection settings are read from the [snowflake] section of an INI file
# located under the working directory captured in `masterdir`.
config = configparser.ConfigParser()
config_path = os.path.join(masterdir, '03-configfiles', 'pw-config.ini')

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that result so a wrong path fails here
# with a clear message instead of a confusing NoSectionError further down.
if not config.read(config_path):
    raise FileNotFoundError(
        f"Snowflake config file not found or unreadable: {config_path}"
    )

# credentials pulled in from hidden config file
acct = config.get('snowflake', 'acct')
user = config.get('snowflake', 'user')
pwd = config.get('snowflake', 'password')
role = config.get('snowflake', 'role')
wh = config.get('snowflake', 'warehouse')
db = config.get('snowflake', 'database')
sch = config.get('snowflake', 'schema')

# Echo the non-secret connection details so the run log shows which
# account/database is being queried; the password is deliberately not printed.
print('Snakeflow connection.........')
print('Account:', acct)
print('User:', user)
print('db:', db)
print('..............................')
print('')

# Pull the whole `surveydata` table from Snowflake into the DataFrame `df`.
#
# Both handles are pre-bound to None so the cleanup in `finally` can test them
# directly. Probing `locals()` at module/notebook level would also match stale
# `cur`/`conn` objects left over from an earlier run.
conn = None
cur = None
try:
    # Configure a connection
    conn = snowflake.connector.connect(
        user=user,
        password=pwd,
        account=acct,
        warehouse=wh,
        database=db,
        schema=sch
    )

    # Create a cursor object
    cur = conn.cursor()

    # Execute the query
    cur.execute("SELECT * FROM surveydata")

    # Fetch all the results
    results = cur.fetchall()

    # Convert results into a DataFrame; the first field of each entry in
    # `cur.description` is the column name.
    column_names = [col[0] for col in cur.description]
    df = pd.DataFrame(results, columns=column_names)

    # DataFrame.info() prints its column/dtype summary itself and returns
    # None, so it is called directly: wrapping it in print() would emit a
    # stray "None" line after the summary.
    df.info()

finally:
    # Always release the cursor and connection, even if the query failed.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()


/Users/danramirez/MBS[H[2J
Snakeflow connection.........
Account: edlastp-nq46411
User: DANRAMIREZMBS
db: SAMPLEDATA
..............................

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231 entries, 0 to 1230
Data columns (total 75 columns):
 #   Column                                                                                                      Non-Null Count  Dtype  
---  ------                                                                                                      --------------  -----  
 0   ID                                                                                                          1231 non-null   object 
 1   GENDER                                                                                                      1212 non-null   object 
 2   ETHNICITY                                                                                                   1107 non-null   object 
 3   EDUCATION                                           

# Regression example

In [7]:
import pandas as pd
import statsmodels.api as sm
import numpy as np  # Importing numpy
from sklearn.model_selection import train_test_split

clustercount = 3
model = 'OLS'

# Map each supported model name to its statsmodels estimator class and reject
# anything else up front. Without this check an unknown name would leave
# `model_var` unbound (or stale from a previous run) inside the fitting loop.
estimators = {'OLS': sm.OLS, 'LOGIT': sm.Logit}
if model not in estimators:
    raise ValueError(
        "Unsupported model " + repr(model)
        + "; expected one of " + ", ".join(sorted(estimators))
    )

# Cluster flag creation-------------------------------------
# One 0/1 indicator column per cluster: 'cluster k' is 1 where CLUSTER == k
# and 0 everywhere else.
for cluster_id in range(1, clustercount + 1):
    df['cluster ' + str(cluster_id)] = np.where(df['CLUSTER'] == cluster_id, 1, 0)

# Filter X variables (predictors) and Y variable (response)
X = df.filter(regex=("^XVAR_"))
# NOTE(review): Y is extracted here but the models below are fitted against
# the cluster flags, not Y — confirm that is intended.
Y = df['Y_OVERALL_SAT_SHOPPINGEXPERIENCE_TOP2B']

# Adding a constant to the model (intercept)
X = sm.add_constant(X)

models = []  # For storing model summaries

# Fit one model per cluster, using that cluster's 0/1 flag as the response.
for cluster_id in range(1, clustercount + 1):
    # Y variable inputs----------------------------------
    cluster_var_name = 'cluster ' + str(cluster_id)
    y_cluster = df[cluster_var_name]

    # Fixed random_state keeps the 75/25 split reproducible between runs;
    # only the training portion is used for fitting here.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_cluster, test_size=0.25, random_state=0
    )

    model_var = estimators[model](y_train, X_train).fit()

    models.append(model_var.summary2())  # Storing model summaries

# If you need to print or use each model summary, you can iterate over `models` list
for summary in models:
    print(summary)


                                                             Results: Ordinary least squares
Model:                                            OLS                                           Adj. R-squared:                                  0.142    
Dependent Variable:                               cluster 1                                     AIC:                                             1190.3853
Date:                                             2024-02-20 12:18                              BIC:                                             1451.0772
No. Observations:                                 923                                           Log-Likelihood:                                  -541.19  
Df Model:                                         53                                            F-statistic:                                     3.870    
Df Residuals:                                     869                                           Prob (F-statistic):                 