In [398]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [399]:
df = pd.read_csv('source/FReDA4.csv')
# df.info()

In [400]:
df2 = df.copy().dropna()
# df2.info()

In [401]:
df2 = df2[df2["Group3"] == "Couple Mixed"]

In [402]:
df2 = (
    df2.groupby("CoupleId")
    .filter(lambda g: {"Anchor", "Partner"}.issubset(set(g["Role"])))
)

In [403]:
all_factors = [
    'Sex',
    'Age',
    'Work Status',
    'Education',
    'East',
    'Urbanization',

    'Extraversion',
    'Agreeableness',
    'Conscientiousness',
    'Openness',
    'Neuroticism',
    'Conservatism',
    'Religiosity',

    'Depressiveness',
    'Loneliness',
    'Self-esteem',
    'Life Satisfaction',
    'Health',

    'Relationship Sex',
    'Relationship Length',
    'Age difference',
    'Married',
    'Cohabitation',
    'Kids',
    'Communication Quality',
    'Relationship Satisfaction',
    'Conflict Management',
]

In [404]:
X = df2[all_factors]
y = df2["Divergence"]

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
#
# X = pd.DataFrame(X_scaled, columns=all_factors, index=X.index)

In [405]:
# from sklearn.model_selection import KFold
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score
# import numpy as np
#
# kf = KFold(n_splits=4, shuffle=True)
#
# cv_scores = []
#
# for train_index, test_index in kf.split(X):
#     # Split the data into training and test sets
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#
#     # Fit a linear regression model
#     model = LinearRegression().fit(X_train, y_train)
#
#     # Make predictions on the test set
#     y_pred = model.predict(X_test)
#
#     # Calculate the R^2 score
#     score = r2_score(y_test, y_pred)
#
#     # Append the score to the list of cross-validation scores
#     cv_scores.append(score)
#
# # Calculate the mean and standard deviation of the cross-validation scores
# mean_score = np.mean(cv_scores)
# std_score = np.std(cv_scores)
#
# print(f"Cross-validation scores: {cv_scores}")
# print(f"Mean R^2 score: {mean_score:.3f}")
# print(f"Standard deviation of R^2 scores: {std_score:.3f}")

In [406]:
def stepwise_selection(X, y,
                       threshold_in=0.15,
                       threshold_out=0.15):
    included = []
    while True:
        changed = False
        # ---------- FORWARD STEP ----------
        excluded = list(set(X.columns) - set(included))
        new_pvals = pd.Series(index=excluded, dtype=float)

        for feature in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [feature]])).fit()
            new_pvals[feature] = model.pvalues[feature]

        if not new_pvals.empty:
            best_feature = new_pvals.idxmin()
            # print(f"Best feature: {best_feature}", new_pvals)
            best_pval = new_pvals.min()

            if best_pval < threshold_in:
                included.append(best_feature)
                changed = True

        # ---------- BACKWARD STEP ----------
        if included:
            model = sm.OLS(y, sm.add_constant(X[included])).fit()
            pvals = model.pvalues.iloc[1:]  # drop the intercept

            worst_feature = pvals.idxmax()
            worst_pval = pvals.max()

            if worst_pval > threshold_out:
                included.remove(worst_feature)
                changed = True

        if not changed:
            break

    return included


bidirectional_selected_features = stepwise_selection(X, y)
print("Selected features using Bidirectional Elimination: ", bidirectional_selected_features)

Selected features using Bidirectional Elimination:  ['Loneliness', 'Communication Quality', 'Agreeableness', 'Religiosity', 'Conflict Management', 'Self-esteem']


In [407]:
final_model = sm.OLS(y, sm.add_constant(X[bidirectional_selected_features])).fit()
# print(final_model.summary())

In [408]:
from io import StringIO

results_as_html = final_model.summary().tables[1].as_html()
pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.0871,0.635,-1.711,0.088,-2.336,0.162
Loneliness,0.2355,0.061,3.851,0.0,0.115,0.356
Communication Quality,-0.0778,0.021,-3.675,0.0,-0.119,-0.036
Agreeableness,0.0512,0.029,1.782,0.076,-0.005,0.108
Religiosity,0.0436,0.019,2.315,0.021,0.007,0.081
Conflict Management,0.0439,0.024,1.856,0.064,-0.003,0.09
Self-esteem,0.0537,0.029,1.848,0.065,-0.003,0.111
