In [780]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [781]:
# Load the survey extract; the path is relative to the kernel's working directory.
df = pd.read_csv('source/FReDA4.csv')
# df.info()  # uncomment to inspect dtypes and non-null counts

In [782]:
# Keep complete cases only. dropna() already returns a new frame and leaves
# `df` untouched, so an explicit copy() beforehand only duplicates the data.
# NOTE(review): with no `subset=` this drops a row when ANY column is missing,
# including columns that may not be used as predictors later — confirm intended.
df2 = df.dropna()
# df2.info()  # uncomment to inspect dtypes and non-null counts

In [783]:
# Restrict the sample to rows whose Group3 label is exactly "Couple Mixed".
df2 = df2.loc[df2["Group3"].eq("Couple Mixed")]

In [784]:
# Keep only couples for which both an "Anchor" row and a "Partner" row are
# present; every row of a couple missing either role is discarded.
df2 = df2.groupby("CoupleId").filter(
    lambda couple: set(couple["Role"]) >= {"Anchor", "Partner"}
)

In [785]:
# Column names used as predictors (X) for the regression models.
# Entries that are commented out are excluded from the analysis.
all_factors = [
    'Sex',
    'Age',
    'Work Status',
    'Education',
    # # 'East',
    # # 'Urbanization',
    'Extraversion',
    'Agreeableness',
    'Conscientiousness',
    'Openness',
    'Neuroticism',
    # 'Conservatism',
    # 'Religiosity',
    'Depressiveness',
    'Loneliness',
    'Self-esteem',
    'Life Satisfaction',
    'Health',
    'Relationship Sex',
    'Relationship Length',
    'Age difference',
    'Married',
    'Cohabitation',
    'Kids',
    'Communication Quality',
    'Relationship Satisfaction',
    'Conflict Management',
]

In [786]:
# Response variable and raw predictor matrix.
y = df2["Frequency of Touch"]
X = df2[all_factors]

# Standardize each predictor: learn the per-column statistics, then apply them
# to the same rows.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled values back into a DataFrame so the column labels and the
# original row index are preserved for the models that follow.
X = pd.DataFrame(X_scaled, index=X.index, columns=all_factors)

In [787]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

# 4-fold cross-validated R^2 of a plain linear regression on all predictors.
# The fixed random_state pins the shuffled fold assignment, so the scores
# printed by this cell are reproducible after Restart & Run All.
# NOTE(review): the couple filter keeps both an Anchor and a Partner row per
# CoupleId, so a row-wise KFold can put the two members of one couple into
# different folds. If that overlap matters, consider splitting by CoupleId
# (e.g. GroupKFold) — confirm with the analysis owner.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

cv_scores = []

for train_index, test_index in kf.split(X):
    # Positional indices from KFold -> row subsets for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit on the training rows only, then score on the held-out rows
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)

    cv_scores.append(score)

# Summarize the per-fold scores (np.std uses the population formula, ddof=0)
mean_score = np.mean(cv_scores)
std_score = np.std(cv_scores)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean R^2 score: {mean_score:.3f}")
print(f"Standard deviation of R^2 scores: {std_score:.3f}")

Cross-validation scores: [0.31146709504487624, 0.2535048430067598, 0.369483647529682, 0.3745495919553651]
Mean R^2 score: 0.327
Standard deviation of R^2 scores: 0.049


In [788]:
def stepwise_selection(X, y,
                       threshold_in=0.15,
                       threshold_out=0.15):
    """Select OLS predictors by bidirectional (forward + backward) stepwise search.

    Every round performs at most one forward step and one backward step:

    * forward  - each column not yet selected is added in turn to the current
      model; the one whose coefficient has the smallest p-value is kept if
      that p-value is strictly below ``threshold_in``;
    * backward - the current model is refit and the selected column with the
      largest p-value is dropped if it is strictly above ``threshold_out``.

    The search ends when a round changes nothing, or when a round leads back
    to a feature set that was already reached before (which would otherwise
    repeat forever, e.g. when ``threshold_in > threshold_out``); in the latter
    case a ``RuntimeWarning`` is issued.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response passed to ``sm.OLS`` as the dependent variable.
    threshold_in : float, default 0.15
        Maximum p-value for a candidate to enter the model.
    threshold_out : float, default 0.15
        p-value above which a selected predictor is removed.

    Returns
    -------
    list
        Labels of the selected columns, in the order they were added.
    """
    import warnings  # local import: only needed on the cycle-guard path

    included = []
    # Feature sets reached at the end of earlier rounds (plus the empty start).
    # One round is a deterministic function of the current set, so coming back
    # to a known set means the loop would never terminate.
    visited = {frozenset()}

    while True:
        changed = False

        # ---------- FORWARD STEP ----------
        # Iterate in column order (not set order) so ties in p-values are
        # broken the same way on every run.
        excluded = [col for col in X.columns if col not in included]
        forward_pvals = {}
        for feature in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [feature]])).fit()
            forward_pvals[feature] = model.pvalues[feature]

        # Undefined p-values can never pass the threshold; drop them up front.
        new_pvals = pd.Series(forward_pvals, dtype=float).dropna()
        if not new_pvals.empty:
            best_feature = new_pvals.idxmin()
            if new_pvals[best_feature] < threshold_in:
                included.append(best_feature)
                changed = True

        # ---------- BACKWARD STEP ----------
        if included:
            model = sm.OLS(y, sm.add_constant(X[included])).fit()
            # Select by label so the intercept is left out regardless of where
            # (or whether) add_constant inserted it.
            pvals = model.pvalues[included].dropna()
            if not pvals.empty:
                worst_feature = pvals.idxmax()
                if pvals[worst_feature] > threshold_out:
                    included.remove(worst_feature)
                    changed = True

        if not changed:
            break

        state = frozenset(included)
        if state in visited:
            warnings.warn(
                "stepwise_selection returned to a previously visited feature "
                "set; stopping to avoid an infinite loop "
                "(use threshold_in <= threshold_out).",
                RuntimeWarning,
                stacklevel=2,
            )
            break
        visited.add(state)

    return included


# Run the stepwise search on X and y with the function's default p-value thresholds.
bidirectional_selected_features = stepwise_selection(X, y)
print("Selected features using Bidirectional Elimination: ", bidirectional_selected_features)

Selected features using Bidirectional Elimination:  ['Communication Quality', 'Relationship Satisfaction', 'Relationship Length', 'Work Status', 'Depressiveness', 'Openness', 'Relationship Sex', 'Loneliness']


In [789]:
# Refit OLS with an added constant term on only the stepwise-selected predictors.
final_model = sm.OLS(y, sm.add_constant(X[bidirectional_selected_features])).fit()
# print(final_model.summary())  # uncomment for the full text summary

In [790]:
from io import StringIO

# Take the second table of the model summary (the per-coefficient table shown
# in this cell's output) and export it as an HTML string.
results_as_html = final_model.summary().tables[1].as_html()
# Parse the HTML back into a DataFrame for rich display; the string is wrapped
# in StringIO so read_html receives a file-like object. First row -> header,
# first column -> index; read_html returns a list, so take its only table.
pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
const,2.481,0.048,51.787,0.0,2.387,2.575
Communication Quality,0.3222,0.061,5.267,0.0,0.202,0.443
Relationship Satisfaction,0.4006,0.065,6.122,0.0,0.272,0.529
Relationship Length,-0.234,0.051,-4.573,0.0,-0.335,-0.133
Work Status,0.1069,0.049,2.17,0.031,0.01,0.204
Depressiveness,0.1556,0.057,2.749,0.006,0.044,0.267
Openness,0.1026,0.049,2.093,0.037,0.006,0.199
Relationship Sex,-0.1027,0.049,-2.103,0.036,-0.199,-0.007
Loneliness,-0.0906,0.06,-1.518,0.13,-0.208,0.027
