# Part 1: GMM Model with Stock-Return Data

In [29]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import GMM
import io

In [30]:
from google.colab import files

# Prompt for the Part 1 CSV via Colab's upload helper. The returned mapping is
# consumed by the next cell: keys are file names, values are the raw contents.
uploaded = files.upload()

Saving midterm_partone.csv to midterm_partone (2).csv


In [31]:
# Fail early with a clear message when nothing was uploaded (e.g. the dialog
# was cancelled); otherwise indexing the empty mapping raises a bare IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose the Part 1 CSV.")

# Only the first uploaded file is used; any additional files are ignored.
filename = list(uploaded.keys())[0]
file_content = uploaded[filename]

# The upload is held in memory as bytes, so wrap it in a file-like buffer for pandas.
data_part1 = pd.read_csv(io.BytesIO(file_content))

# Preview the first rows to confirm the file parsed as expected.
print(data_part1.head())

   Constant  Stock Change  Inventory Turnover  Operating Profit  \
0         1      0.870332            1.795946          0.115846   
1         1     -0.047347            1.395501          0.436967   
2         1      0.001176            1.664563          0.541016   
3         1     -0.901200            1.605738          0.539399   
4         1     -0.176353            1.591451          0.539938   

   Interaction Effect  Current Ratio  Quick Ratio  Debt Asset Ratio  
0            0.208053       1.672527     0.255171          0.473317  
1            0.609788       1.637261     0.221763          0.489967  
2            0.900555       1.640619     0.189141          0.374269  
3            0.866133       1.436221     0.131944          0.224399  
4            0.859285       1.433140     0.183095          0.213446  


In [32]:
# Column roles for the GMM specification: one response, three explanatory
# variables, and three instruments.
response_col = "Stock Change"
regressor_cols = ["Inventory Turnover", "Operating Profit", "Interaction Effect"]
instrument_cols = ["Current Ratio", "Quick Ratio", "Debt Asset Ratio"]

# Convert each selection to a plain NumPy array for the GMM model.
y_var = np.array(data_part1[response_col])
x_vars = np.array(data_part1[regressor_cols])
iv_vars = np.array(data_part1[instrument_cols])


In [35]:
class CustomGMMWithDelta(GMM):
    """GMM model for a linear equation whose instrument moments share an offset ``delta``.

    The parameter vector has five entries, in this order: the intercept, one
    slope for each of the first three ``exog`` columns, and ``delta``.
    """

    def momcond(self, params):
        """Return the moment contributions, one row per observation and one column per moment.

        Columns, in order:
          0     residual (intercept moment)
          1-3   residual times each of the first three ``exog`` columns
          4-6   residual times each of the first three ``instrument`` columns, plus ``delta``

        In this notebook the ``exog`` columns are Inventory Turnover, Operating
        Profit and Interaction Effect, and the instruments are Current Ratio,
        Quick Ratio and Debt Asset Ratio.
        """
        intercept, slope_0, slope_1, slope_2, delta = params

        regressors = self.exog
        instruments = self.instrument

        # Linear prediction, then the residual against the dependent variable.
        fitted = intercept + slope_0 * regressors[:, 0] + slope_1 * regressors[:, 1] + slope_2 * regressors[:, 2]
        resid = self.endog - fitted

        # Orthogonality moments for the explanatory variables.
        exog_moments = [resid * regressors[:, j] for j in range(3)]
        # Instrument moments, each shifted by the common delta term.
        iv_moments = [resid * instruments[:, j] + delta for j in range(3)]

        return np.column_stack([resid, *exog_moments, *iv_moments])

In [36]:
# Starting values for the optimizer: five parameters (intercept, three slopes,
# and delta), all initialised to 0.1.
initial_params = np.full(5, 0.1)

In [38]:
# Build the model with 7 moment conditions and 5 parameters, matching the
# columns returned by momcond and the length of initial_params.
gmm_model = CustomGMMWithDelta(
    endog=y_var,
    exog=x_vars,
    instrument=iv_vars,
    k_moms=7,
    k_params=5,
)

# Estimate the parameters, starting the optimizer from initial_params.
gmm_results = gmm_model.fit(initial_params)

Optimization terminated successfully.
         Current function value: 0.000095
         Iterations: 9
         Function evaluations: 14
         Gradient evaluations: 14
Optimization terminated successfully.
         Current function value: 0.001753
         Iterations: 8
         Function evaluations: 11
         Gradient evaluations: 11
Optimization terminated successfully.
         Current function value: 0.001735
         Iterations: 7
         Function evaluations: 12
         Gradient evaluations: 12
Optimization terminated successfully.
         Current function value: 0.001735
         Iterations: 5
         Function evaluations: 9
         Gradient evaluations: 9
Optimization terminated successfully.
         Current function value: 0.001735
         Iterations: 0
         Function evaluations: 1
         Gradient evaluations: 1


In [44]:
# Print the fitted model's summary table (coefficients, standard errors,
# z-statistics, p-values, and the Hansen J statistic).
print(gmm_results.summary())

                          CustomGMMWithDelta Results                          
Dep. Variable:                      y   Hansen J:                        2.942
Model:             CustomGMMWithDelta   Prob (Hansen J):                 0.230
Method:                           GMM                                         
Date:                Mon, 11 Nov 2024                                         
Time:                        09:42:25                                         
No. Observations:                1696                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
p 0           -0.0011      0.017     -0.063      0.949      -0.034       0.032
p 1            0.0004      0.000      1.001      0.317      -0.000       0.001
p 2           -0.1191      0.031     -3.860      0.000      -0.180      -0.059
p 3            0.0014      0.000      3.640      0.0

In [39]:
# Extract delta's coefficient and p-value to analyze the bias claim.
# Delta is the fifth (last) entry of the parameter vector unpacked in momcond,
# hence zero-based index 4.
delta_position = 4  # Index of delta in the parameter array
delta_coefficient = gmm_results.params[delta_position]
delta_p_value = gmm_results.pvalues[delta_position]

In [40]:
# Report delta's point estimate and its p-value.
print("Delta coefficient:", delta_coefficient)
print("Delta p-value:", delta_p_value)

Delta coefficient: -0.0003478821393261296
Delta p-value: 0.8951775822184709


In [41]:
# Compare delta's p-value with the 5% significance level and report the verdict.
alpha = 0.05
delta_is_significant = delta_p_value < alpha
print(
    "The claim about the bias term (delta) is statistically supported."
    if delta_is_significant
    else "No significant evidence to support the bias term claim (delta)."
)


No significant evidence to support the bias term claim (delta).


# Part 2: Credit Rating Classification with Logistic Regression

In [45]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report


In [47]:
# Prompt for the Part 2 (credit) CSV via Colab's upload helper. The returned
# mapping is consumed by the next cell: keys are file names, values are the
# raw contents.
uploaded2 = files.upload()

Saving midterm_parttwo.csv to midterm_parttwo.csv


In [48]:
# Fail early with a clear message when nothing was uploaded (e.g. the dialog
# was cancelled); otherwise indexing the empty mapping raises a bare IndexError.
if not uploaded2:
    raise ValueError("No file was uploaded; re-run the upload cell and choose the Part 2 CSV.")

# Only the first uploaded file is used; any additional files are ignored.
filename = list(uploaded2.keys())[0]
file_content = uploaded2[filename]

# The upload is held in memory as bytes, so wrap it in a file-like buffer for pandas.
data_part2 = pd.read_csv(io.BytesIO(file_content))

# Preview the first rows to confirm the file parsed as expected.
print(data_part2.head())

   Years of Education after High School Requested Credit Amount  \
0                                     1                     Low   
1                                     2                     Low   
2                                     1                     Low   
3                                     3                     Low   
4                                     3                     Low   

  Number of Dependents Monthly Income Monthly Expense Marital Status  \
0         No dependent       Very low        Very low        Married   
1         No dependent       Very low        Very low         Single   
2         No dependent       Very low        Very low         Single   
3         No dependent       Very low        Very low        Married   
4         No dependent       Very low        Very low         Single   

  Credit Rating  
0      Positive  
1      Positive  
2      Positive  
3      Positive  
4      Negative  


In [49]:
# Encode the dependent variable (Credit Rating) as binary: Positive -> 1, Negative -> 0.
rating_codes = {'Positive': 1, 'Negative': 0}
credit_rating = data_part2['Credit Rating']

# Skip the mapping when the column already holds only 0/1, so re-running this
# cell does not turn every value into NaN.
if not credit_rating.isin(list(rating_codes.values())).all():
    encoded_rating = credit_rating.map(rating_codes)
    # Series.map yields NaN for any label missing from the mapping; surface
    # those here instead of letting NaN targets reach the model.
    unmapped = credit_rating[encoded_rating.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected 'Credit Rating' labels: {list(unmapped)}")
    data_part2['Credit Rating'] = encoded_rating.astype('int64')


In [50]:
# Group the predictor columns by the encoding they need.
# Nominal (unordered) column, one-hot encoded later.
nominal_feat = 'Marital Status'
# Column that is already numeric and needs no encoding.
numeric_feats = ['Years of Education after High School']
# Ordered categorical columns, ordinal-encoded later; the order of this list
# must match the order of the category lists given to the encoder.
categorical_feats = [
    'Requested Credit Amount',
    'Number of Dependents',
    'Monthly Income',
    'Monthly Expense',
]

In [51]:
# One-hot encode the nominal column (Marital Status). The first category is
# dropped so the remaining dummies are not perfectly collinear, and a dense
# array is requested instead of a sparse matrix.
onehot = OneHotEncoder(drop='first', sparse_output=False)
marital_status_encoded = onehot.fit_transform(data_part2.loc[:, [nominal_feat]])


In [52]:
# Wrap the encoded array in a DataFrame that carries the encoder's column names.
# Reuse data_part2's index: concat(axis=1) aligns rows by index label, so a
# fresh RangeIndex would misalign (and introduce NaNs) if data_part2 were ever
# filtered or re-indexed before this cell.
marital_status_dummies = pd.DataFrame(
    marital_status_encoded,
    columns=onehot.get_feature_names_out([nominal_feat]),
    index=data_part2.index,
)

# Replace the original text column with its dummy-variable columns.
data_part2 = pd.concat(
    [data_part2.drop(columns=[nominal_feat]), marital_status_dummies],
    axis=1,
)


In [53]:
# Ordered levels (lowest to highest) for each ordinal column; the encoder maps
# each level to its position in the list.
# NOTE(review): 'Monthly Income' spells its top level 'Very High' while
# 'Monthly Expense' uses 'Very high' - confirm both match the data exactly.
ordinal_levels = {
    'Requested Credit Amount': ['Low', 'Medium', 'High'],
    'Number of Dependents': ['No dependent', 'Less than 2', 'More than 2'],
    'Monthly Income': ['Very low', 'Low', 'Moderate', 'High', 'Very High'],
    'Monthly Expense': ['Very low', 'Low', 'Moderate', 'High', 'Very high'],
}

# Pass the level lists in the same order as the columns being encoded.
ordinal = OrdinalEncoder(categories=[ordinal_levels[col] for col in categorical_feats])
data_part2[categorical_feats] = ordinal.fit_transform(data_part2[categorical_feats])


In [54]:
# Separate the target from the predictors, then hold out half of the rows for
# testing. The fixed random_state makes the split reproducible.
y = data_part2['Credit Rating']
X = data_part2.drop(columns='Credit Rating')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)


In [55]:
# Fit a logistic regression with scikit-learn's default settings on the
# training half of the data.
logit_model = LogisticRegression()
logit_model.fit(X_train, y_train)


In [56]:
# Evaluate the model with the standard decision rule used by predict().
y_pred = logit_model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
# zero_division=0 reports 0.00 for a class that is never predicted (the same
# value shown by default) without emitting undefined-metric warnings.
classification_rep = classification_report(y_test, y_pred, zero_division=0)

print('Original Model Evaluation')
print(conf_matrix)
print(classification_rep)

Original Model Evaluation
[[   0  577]
 [   0 3464]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       577
           1       0.86      1.00      0.92      3464

    accuracy                           0.86      4041
   macro avg       0.43      0.50      0.46      4041
weighted avg       0.73      0.86      0.79      4041



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [57]:
# Target share of applicants to approve, in percent.
approval_pct = 15

# Predicted probability of the positive class (second column) for each test row.
y_probs = logit_model.predict_proba(X_test)[:, 1]

# Approve the top `approval_pct` percent: the cut-off is the matching upper
# percentile of the predicted probabilities (the 85th for a 15% target).
# NOTE(review): `>=` approves every applicant tied at the cut-off, so the
# realised approval share can be slightly above the target.
approval_threshold = np.percentile(y_probs, 100 - approval_pct)
y_pred_adjusted = np.where(y_probs >= approval_threshold, 1, 0)


In [58]:
# Score the threshold-adjusted predictions against the same test labels.
conf_matrix_adjusted = confusion_matrix(y_test, y_pred_adjusted)
classification_rep_adjusted = classification_report(y_test, y_pred_adjusted)

# Emit the header, confusion matrix, and report on consecutive lines.
print(
    'New Model with 15% Threshold',
    conf_matrix_adjusted,
    classification_rep_adjusted,
    sep='\n',
)

New Model with 15% Threshold
[[ 492   85]
 [2929  535]]
              precision    recall  f1-score   support

           0       0.14      0.85      0.25       577
           1       0.86      0.15      0.26      3464

    accuracy                           0.25      4041
   macro avg       0.50      0.50      0.25      4041
weighted avg       0.76      0.25      0.26      4041

