In [35]:
import pandas as pd

df = pd.read_csv('WVS_Cross-National_Wave_7_csv_v5_0.csv')
# Checking for missing values
print(df.isnull().sum())
# Removing rows with missing values
df_cleaned = df.dropna()




version             0
doi                 0
A_WAVE              0
A_YEAR              0
A_STUDY             0
                ...  
v2psprbrch      90360
v2psprlnks      90360
v2psplats       90360
v2xnp_client    90360
v2xps_party     90360
Length: 606, dtype: int64


In [7]:
# Calculating mean, median, and standard deviation
mean_wellbeing = df_cleaned['Q49'].mean()
median_wellbeing = df_cleaned['Q49'].median()
std_wellbeing = df_cleaned['Q49'].std()

print(f"Mean: {mean_wellbeing}, Median: {median_wellbeing}, Std Dev: {std_wellbeing}")


Mean: 7.573263558515699, Median: 8.0, Std Dev: 1.902954616944322


In [9]:


# Defining custom bins and labels
bins = [1, 3, 7, 10]
labels = ['Low', 'Medium', 'High']

# List of variables you want to group
likert_variables = ['Q49', 'Q288', 'Q164','Q250','Q287']

# Apply the grouping for each specified variable
for var in likert_variables:
    df[f'{var}_group'] = pd.cut(df[var], bins=bins, labels=labels, include_lowest=True)

# Display the result
print(df[[f'{var}_group' for var in likert_variables]])


      Q49_group Q288_group Q164_group Q250_group Q287_group
0          High     Medium     Medium       High        Low
1          High       High        Low       High        Low
2          High     Medium       High       High        Low
3          High     Medium        Low     Medium     Medium
4        Medium     Medium     Medium       High     Medium
...         ...        ...        ...        ...        ...
94273      High        NaN       High       High        NaN
94274      High     Medium        Low     Medium        NaN
94275      High     Medium       High     Medium        NaN
94276    Medium       High     Medium        NaN        NaN
94277      High     Medium     Medium       High        NaN

[94278 rows x 5 columns]


In [39]:
likert_variable_new = 'Q275'

# Defining custom bins and labels for the new variable
bins_new = [0, 5, 6, 8]
labels_new = ['Low', 'Medium', 'High']

# Creating a new categorical variable based on the new Likert scale grouping
df['Q275'] = pd.cut(df_cleaned[likert_variable_new], bins=bins_new, labels=labels_new, include_lowest=True)

# Display the result for the new variable
print(df_cleaned[[likert_variable_new, 'Q275']])

      Q275  Q275
1007     3     3
1010     3     3
1014     3     3
1015     6     6
1016     3     3
...    ...   ...
6020     7     7
6102     5     5
6159     6     6
6181     7     7
6201     2     2

[2102 rows x 2 columns]


In [14]:
expected_answers = df['Q275_group']
print("Expected Answers for Q275:")
print(expected_answers)

Expected Answers for Q275:
0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
94273    NaN
94274    NaN
94275    NaN
94276    NaN
94277    NaN
Name: Q275_group, Length: 94278, dtype: category
Categories (3, object): ['Low' < 'Medium' < 'High']


In [25]:
# test for multicollinearity

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor




selected_columns = ['Q49', 'Q288','Q275', 'Q250', 'Q164', 'Q287', 'Q262', 'Q260','Q273','Q45', 'Q47']
df_selected = df_cleaned[selected_columns]


X = sm.add_constant(df_selected)


vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]


print(vif_data)


   Variable         VIF
0     const  110.731982
1       Q49    1.180923
2      Q288    1.264284
3      Q275    1.399039
4      Q250    1.107424
5      Q164    1.233572
6      Q287    1.313162
7      Q262    1.153807
8      Q260    1.031062
9      Q273    1.074866
10      Q45    1.104590
11      Q47    1.139999


  x = pd.concat(x[::order], 1)


In [17]:
from scipy.stats import spearmanr


target_variable = 'Q49'
other_variables = ['Q288', 'Q275', 'Q250', 'Q164', 'Q287', 'Q262', 'Q260','Q273','Q45', 'Q47']

correlation_results = {}
for var in other_variables:
    corr, p_value = spearmanr(df[target_variable], df[var])
    correlation_results[var] = {'correlation': corr, 'p-value': p_value}
    
for var, values in correlation_results.items():
    print(f"Spearman Correlation between {target_variable} and {var}:")
    print(f"Correlation Coefficient: {values['correlation']}")
    print(f"P-value: {values['p-value']}")
    print("\n")

Spearman Correlation between Q49 and Q288:
Correlation Coefficient: 0.17139698656237717
P-value: 0.0


Spearman Correlation between Q49 and Q275:
Correlation Coefficient: nan
P-value: nan


Spearman Correlation between Q49 and Q250:
Correlation Coefficient: 0.1538938189385846
P-value: 0.0


Spearman Correlation between Q49 and Q164:
Correlation Coefficient: 0.12690842517289913
P-value: 0.0


Spearman Correlation between Q49 and Q287:
Correlation Coefficient: -0.1262742450991426
P-value: 0.0


Spearman Correlation between Q49 and Q262:
Correlation Coefficient: 0.018773211989478577
P-value: 8.180601378468564e-09


Spearman Correlation between Q49 and Q260:
Correlation Coefficient: 0.010666831111803584
P-value: 0.0010556078327589146


Spearman Correlation between Q49 and Q273:
Correlation Coefficient: -0.04813923903157425
P-value: 1.714808985544932e-49


Spearman Correlation between Q49 and Q45:
Correlation Coefficient: -0.087731409726767
P-value: 1.9693847048885364e-160


Spearman Correl

In [18]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import accuracy_score
# Setting up X (independent variables) and Y (dependent variable)
X = df_cleaned [['Q288', 'Q275', 'Q250', 'Q164', 'Q287', 'Q262', 'Q260','Q273','Q45','Q47']]
Y = df_cleaned ['Q49']

X = sm.add_constant(X)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Creating and fitting the ordered logistic regression model
model = sm.OLS(Y_train, X_train).fit()

# Prediction on the test set
Y_pred = model.predict(X_test)

# Converting predicted values to integer (since Y is ordinal)
Y_pred = Y_pred.round().astype(int)

# Evaluating accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Displaying model summary
print(model.summary())

Accuracy: 0.27
                            OLS Regression Results                            
Dep. Variable:                    Q49   R-squared:                       0.146
Model:                            OLS   Adj. R-squared:                  0.141
Method:                 Least Squares   F-statistic:                     28.47
Date:                Tue, 23 Jan 2024   Prob (F-statistic):           7.83e-51
Time:                        21:26:39   Log-Likelihood:                -3316.5
No. Observations:                1681   AIC:                             6655.
Df Residuals:                    1670   BIC:                             6715.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.2754      0.420     

  x = pd.concat(x[::order], 1)
