# Dependecies


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC

# OverSampling

In [2]:
!pip install ctgan

Collecting ctgan
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.14.0 (from ctgan)
  Downloading rdt-1.15.0-py3-none-any.whl.metadata (10 kB)
Collecting Faker>=17 (from rdt>=1.14.0->ctgan)
  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->ctgan)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->ctgan)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->ctgan)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->ctgan)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from tor

In [3]:
from ctgan import CTGAN

def generate_synthetic_data(filepath, num_samples=70, epochs=50):
    # Load the original dataset
    real_data = pd.read_csv(filepath)

    # Identify discrete (categorical) columns
    discrete_columns = real_data.columns.tolist()  # All columns are categorical

    # Train CTGAN
    ctgan = CTGAN(
        epochs=epochs,  # Increase epochs for better fidelity
        verbose=True
    )
    ctgan.fit(real_data, discrete_columns)

    # Generate synthetic data
    synthetic_data = ctgan.sample(num_samples)

    return synthetic_data

# Generate synthetic data
df = generate_synthetic_data("/content/qa-data-eng-analysis.csv")

Gen. (1.01) | Discrim. (0.03): 100%|██████████| 50/50 [00:03<00:00, 14.06it/s]


In [4]:
df = pd.read_csv('/content/qa-data-eng-analysis.csv')

# Data Preprocessing

In [5]:
df.describe()

Unnamed: 0,News-Source,Has-Nightmare,News-Preference,News-Trust,Emotional-Response,Shares-News,Dwelling-On-News,Fear-Reaction
count,75,75,75,75,75,75,75,75
unique,5,3,3,3,4,3,3,4
top,phone,Yes,No,Yes,Very Fear,Yes,OverThink,Talking with Dad
freq,39,50,33,38,25,35,30,26


In [6]:
df = df.replace('UnKown', np.nan)
df = df.replace('Normal', np.nan)

In [7]:
def handleNan_NewsSource(df):
    # Impute with the most frequent value
    df['News-Source'] = df['News-Source'].fillna(df['News-Source'].mode()[0])
    return df

def handleNan_EmotionalResponse(df):
    # Impute with the most frequent value
    df['Emotional-Response'] = df['Emotional-Response'].fillna(df['Emotional-Response'].mode()[0])
    return df

def handleNan_HasNightmare(df):
    # Fill NaN values: 70% YES, 30% NO randomly
    mask = df['Has-Nightmare'].isna()
    n_nan = mask.sum()
    if n_nan > 0:
        df.loc[mask, 'Has-Nightmare'] = np.random.choice(['Yes', 'No'], size=n_nan, p=[0.7, 0.3])
    return df

def handleNan_NewsPreference(df):
    # Impute with the most frequent value
    df['News-Preference'] = df['News-Preference'].fillna(df['News-Preference'].mode()[0])
    return df

def handleNan_NewsTrust(df):
    # Impute with the most frequent value
    df['News-Trust'] = df['News-Trust'].fillna(df['News-Trust'].mode()[0])
    return df

def handleNan_SharesNews(df):
    # Impute with the most frequent value
    df['Shares-News'] = df['Shares-News'].fillna(df['Shares-News'].mode()[0])
    return df

def handleNan_DwellingOnNews(df):
    # Impute with the most frequent value
    df['Dwelling-On-News'] = df['Dwelling-On-News'].fillna(df['Dwelling-On-News'].mode()[0])
    return df

def handleNan_FearReaction(df):
    # Impute with the most frequent value
    df['Fear-Reaction'] = df['Fear-Reaction'].fillna(df['Fear-Reaction'].mode()[0])
    return df

def handleNanValues(df):
    df = handleNan_NewsSource(df)
    if 'Has-Nightmare' in df.columns:
        df = handleNan_HasNightmare(df)
    df = handleNan_NewsPreference(df)
    df = handleNan_NewsTrust(df)
    df = handleNan_SharesNews(df)
    df = handleNan_DwellingOnNews(df)
    df = handleNan_FearReaction(df)
    df = handleNan_EmotionalResponse(df)
    return df

In [8]:
df = handleNanValues(df)

In [9]:
df.head()

Unnamed: 0,News-Source,Has-Nightmare,News-Preference,News-Trust,Emotional-Response,Shares-News,Dwelling-On-News,Fear-Reaction
0,Tv,No,No,Yes,Very Fear,No,Thinking,Sharing Immediately
1,Tv,No,Yes,Yes,Sad,Yes,OverThink,Talking with Dad
2,Tv,Yes,No,No,Very Fear,No,Thinking,Trying To Forget
3,Family,No,Yes,Yes,Very Fear,No,Thinking,Sharing Immediately
4,phone,Yes,No,Yes,Sad,No,OverThink,Talking with Dad


In [10]:
df.describe()

Unnamed: 0,News-Source,Has-Nightmare,News-Preference,News-Trust,Emotional-Response,Shares-News,Dwelling-On-News,Fear-Reaction
count,75,75,75,75,75,75,75,75
unique,4,2,2,2,3,2,3,4
top,phone,Yes,No,Yes,Very Fear,Yes,OverThink,Talking with Dad
freq,40,55,46,42,43,44,30,26


In [11]:
#unique values of Emotional-Response
df['Emotional-Response'].unique()

array(['Very Fear', 'Sad', 'Little Fear'], dtype=object)

In [12]:
df = df.dropna(subset=['Has-Nightmare'])

In [13]:
df.describe()

Unnamed: 0,News-Source,Has-Nightmare,News-Preference,News-Trust,Emotional-Response,Shares-News,Dwelling-On-News,Fear-Reaction
count,75,75,75,75,75,75,75,75
unique,4,2,2,2,3,2,3,4
top,phone,Yes,No,Yes,Very Fear,Yes,OverThink,Talking with Dad
freq,40,55,46,42,43,44,30,26


In [14]:
def apply_encoding(df):
    # Binary columns mapping
    binary_cols = ['News-Preference', 'News-Trust', 'Shares-News']
    binary_map = {'Yes': 1, 'No': 0}
    df[binary_cols] = df[binary_cols].replace(binary_map).astype(int)  # Fix: Add .astype(int)

    # Ordinal mappings for categorical columns
    emotional_response_map = {'Sad': 1, 'Little Fear': 2, 'Verry Fear': 3}  # Adjusted mapping
    dwelling_on_news = {'NoThinking': 1, 'Thinking': 2, 'OverThink': 3}

    # Map Emotional-Response and fill NaN with mode
    df['Emotional-Response'] = df['Emotional-Response'].map(emotional_response_map)
    df['Emotional-Response'] = df['Emotional-Response'].fillna(df['Emotional-Response'].mode()[0])

    df['Dwelling-On-News'] = df['Dwelling-On-News'].map(dwelling_on_news)

    # One-hot encoding with 0/1 instead of True/False
    news_source_dummies = pd.get_dummies(df['News-Source'], prefix='NewsSource').astype(int)
    fear_reaction_dummies = pd.get_dummies(df['Fear-Reaction'], prefix='FearReaction').astype(int)

    # Drop original columns and concatenate new dummy columns
    df = df.drop(['News-Source', 'Fear-Reaction'], axis=1)
    df = pd.concat([df, news_source_dummies, fear_reaction_dummies], axis=1)

    return df

In [15]:
df = apply_encoding(df)

  df[binary_cols] = df[binary_cols].replace(binary_map).astype(int)  # Fix: Add .astype(int)


In [16]:
# unique values for all columns
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Unique values for column '{column}': {unique_values}")

Unique values for column 'Has-Nightmare': ['No' 'Yes']
Unique values for column 'News-Preference': [0 1]
Unique values for column 'News-Trust': [1 0]
Unique values for column 'Emotional-Response': [2. 1.]
Unique values for column 'Shares-News': [0 1]
Unique values for column 'Dwelling-On-News': [2 3 1]
Unique values for column 'NewsSource_Family': [0 1]
Unique values for column 'NewsSource_School': [0 1]
Unique values for column 'NewsSource_Tv': [1 0]
Unique values for column 'NewsSource_phone': [0 1]
Unique values for column 'FearReaction_Nothing': [0 1]
Unique values for column 'FearReaction_Sharing Immediately': [1 0]
Unique values for column 'FearReaction_Talking with Dad': [0 1]
Unique values for column 'FearReaction_Trying To Forget': [0 1]


In [17]:
y = df['Has-Nightmare'].map({'Yes': 1, 'No': 0})
X = df.drop('Has-Nightmare', axis=1)

# Model Training

In [18]:
# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
models = {
    "Logistic Regression": LogisticRegression(class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(class_weight="balanced", random_state=42),
    "Support Vector Machine": SVC(class_weight="balanced", probability=True, random_state=42)
}

In [20]:
results = {}

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for ROC-AUC

    # Metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    roc_auc = roc_auc_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)

    # Store results
    results[name] = {
        "Accuracy": report["accuracy"],
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"],
        "ROC-AUC": roc_auc,
        "Confusion Matrix": cm
    }

# Model Metrics Evaluation

In [21]:
results_df = pd.DataFrame(results).T
print(results_df[["Accuracy", "F1-Score (Class 1)", "ROC-AUC"]])

                        Accuracy F1-Score (Class 1)   ROC-AUC
Logistic Regression     0.533333           0.588235  0.611111
Random Forest                0.6           0.727273  0.583333
Support Vector Machine       0.6                0.7  0.583333


# Hyperparameter Tuning

In [22]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1'  # Prioritize F1-score for the minority class
)
grid_search.fit(X_train, y_train)

# Get the best model
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}


In [23]:
from sklearn.model_selection import cross_val_score

# Cross-validate F1-score
cv_scores = cross_val_score(best_rf, X, y, cv=5, scoring='f1')
print(f"Cross-Validated F1-Scores: {cv_scores}")
print(f"Mean F1: {cv_scores.mean():.2f} (±{cv_scores.std():.2f})")

Cross-Validated F1-Scores: [0.83333333 0.86956522 0.7826087  0.72727273 0.76190476]
Mean F1: 0.79 (±0.05)


In [24]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

y_pred = best_rf.predict(X_test)

# Precision and Recall
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
print(f"Precision (Class 1): {precision:.2f}")
print(f"Recall (Class 1): {recall:.2f}")

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Precision (Class 1): 0.89
Recall (Class 1): 0.67
Confusion Matrix:
 [[2 1]
 [4 8]]


In [25]:
def encode_new_data(new_data, training_columns):

    # Convert to DataFrame
    df = pd.DataFrame([new_data])

    # Binary columns mapping
    binary_cols = ['News-Preference', 'News-Trust', 'Shares-News']
    binary_map = {'Yes': 1, 'No': 0}
    for col in binary_cols:
        if col in df.columns:
            df[col] = df[col].replace(binary_map).astype(int)

    # Ordinal mappings for categorical columns
    emotional_response_map = {'Sad': 1, 'Little Fear': 2, 'Very Fear': 3, 'Verry Fear': 3}
    dwelling_on_news = {'NoThinking': 1, 'Thinking': 2, 'OverThink': 3}

    # Map Emotional-Response
    if 'Emotional-Response' in df.columns:
        df['Emotional-Response'] = df['Emotional-Response'].map(emotional_response_map)

    # Map Dwelling-On-News
    if 'Dwelling-On-News' in df.columns:
        df['Dwelling-On-News'] = df['Dwelling-On-News'].map(dwelling_on_news)

    # One-hot encoding for categorical columns
    if 'News-Source' in df.columns:
        news_source_dummies = pd.get_dummies(df['News-Source'], prefix='NewsSource').astype(int)
        df = df.drop(['News-Source'], axis=1)
        df = pd.concat([df, news_source_dummies], axis=1)

    if 'Fear-Reaction' in df.columns:
        fear_reaction_dummies = pd.get_dummies(df['Fear-Reaction'], prefix='FearReaction').astype(int)
        df = df.drop(['Fear-Reaction'], axis=1)
        df = pd.concat([df, fear_reaction_dummies], axis=1)

    # Ensure all columns from training data exist in the new data
    # For missing columns, add them with 0 values
    for col in training_columns:
        if col not in df.columns:
            df[col] = 0

    # Ensure only columns from training data are included
    df = df[training_columns]

    return df

In [31]:
import warnings
warnings.filterwarnings("ignore")
new_data = {
    "News-Source": "phone",
    "News-Preference": "No",
    "News-Trust": "No",
    "Emotional-Response": "Little Fear",
    "Shares-News": "No",
    "Dwelling-On-News": "Thinking",
    "Fear-Reaction": "Talking with Dad"
}

encoded_data = encode_new_data(new_data, X.columns)

prediction = best_rf.predict(encoded_data)
prediction_prob = best_rf.predict_proba(encoded_data)

print(f"Prediction: {'Has nightmares' if prediction[0] == 1 else 'No nightmares'}")
print(f"Probability of having nightmares: {prediction_prob[0][1]:.2f}")

Prediction: No nightmares
Probability of having nightmares: 0.40
