In [82]:
 #Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import chi2_contingency

In [83]:
# Load the dataset with explicit encoding
df = pd.read_csv('data.csv', encoding='latin-1') # or 'ISO-8859-1', 'cp1252'

In [84]:
def categorize_nkill(n):
    if n == 0:
        return 0
    elif n in [1, 2]:
        return 1
    else:
        return 2

df['nkill_category'] = df['nkill'].apply(categorize_nkill)

In [85]:
# Drop the original 'nkill' column
df.drop(columns=['nkill'], inplace=True)

# Identify categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# One-Hot Encoding for categorical variables
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore') # Changed 'sparse' to 'sparse_output'
encoded_features = pd.DataFrame(encoder.fit_transform(df[categorical_cols]))
encoded_features.columns = encoder.get_feature_names_out(categorical_cols)

In [86]:
# Drop original categorical columns and merge one-hot encoded data
df = df.drop(columns=categorical_cols).reset_index(drop=True)
df = pd.concat([df, encoded_features], axis=1)

# Fill missing values with median (for numerical) or mode (for categorical)
df.fillna(df.median(numeric_only=True), inplace=True)

In [87]:
# Define Features (X) and Target (y)
X = df.drop(columns=['nkill_category'])  # Features
y = df['nkill_category']  # Target variable

# Split into training & test set

In [88]:
negative_columns = X.columns[(X < 0).any()].tolist()
print("Columns with negative values:", negative_columns)
X = X.drop(['nperps', 'claimed', 'compclaim', 'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY'], axis = 1)


Columns with negative values: ['nperps', 'claimed', 'compclaim', 'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY']


In [89]:
negative_columns = X.columns[(X < 0).any()].tolist()
print("Columns with negative values:", negative_columns)

Columns with negative values: []


In [90]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [97]:
# Chi-Square Feature Selection
chi2_selector = SelectKBest(chi2, k='all')  # Select all features for ranking
chi2_selector.fit(X_train, y_train)
chi2_scores = chi2_selector.scores_

# Random Forest Feature Importance
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_importances = rf_model.feature_importances_

# Combine Feature Importance Scores
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Chi-Square Score': chi2_scores,
    'Random Forest Importance': rf_importances
})

# Sort by importance
feature_importance_df = feature_importance_df.sort_values(by='Random Forest Importance', ascending=False)

# Display results
print("\nTop Features Based on Random Forest & Chi-Square:\n")
print(feature_importance_df.head(15))  # Show top 15 important features




Top Features Based on Random Forest & Chi-Square:

                        Feature  Chi-Square Score  Random Forest Importance
0                    Unnamed: 0      3.097524e+06                  0.027760
13                  attacktype1      6.533124e+02                  0.024925
1                       eventid      7.357909e+09                  0.024030
2                         iyear      7.360244e+01                  0.022231
27                 weapsubtype1      4.607537e+03                  0.019332
6                      latitude      1.261844e+02                  0.018324
7                     longitude      1.145092e+02                  0.017558
4                          iday      1.457449e+01                  0.015156
3                        imonth      2.421297e+00                  0.013609
17                 targsubtype1      3.751510e+03                  0.013216
12                      success      1.830832e+02                  0.013006
31926    weaptype1_txt_Firearms     

In [99]:
# Get the top 30 features based on Random Forest importance
top_30_features = feature_importance_df.nlargest(50000, 'Random Forest Importance')

# Display the top 30 features
print("Top 30 Features Based on Random Forest Importance:\n", top_30_features)


Top 30 Features Based on Random Forest Importance:
                                                Feature  Chi-Square Score  \
0                                           Unnamed: 0      3.097524e+06   
13                                         attacktype1      6.533124e+02   
1                                              eventid      7.357909e+09   
2                                                iyear      7.360244e+01   
27                                        weapsubtype1      4.607537e+03   
...                                                ...               ...   
26352                            target1_Tehsil Office      7.903278e-01   
26353     target1_Tehsil Secretary: Abdul Ahad Rehmani               NaN   
26355  target1_Telecommunication Engineer B.K. Ganjoor               NaN   
26358                         target1_Telegraph Office               NaN   
26359                       target1_Telephone Exchange               NaN   

       Random Forest Importance  
0

In [101]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Select the top 20 features
selected_features = top_30_features['Feature'].tolist()

# Define new feature set (X) using top 20 features
X_top30 = X[selected_features]
y = df['nkill_category']  # Target variable

# Split into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X_top20, y, test_size=0.2, random_state=42, stratify=y)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))


Model Accuracy: 0.7735
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1556
           1       0.70      0.66      0.68       790
           2       0.68      0.55      0.61       440

    accuracy                           0.77      2786
   macro avg       0.73      0.70      0.72      2786
weighted avg       0.77      0.77      0.77      2786



In [119]:
# pip install streamlit streamlit_jupyter


In [115]:
# from streamlit_jupyter import StreamlitPatcher
# import streamlit as st

# # Patch Streamlit for Jupyter
# StreamlitPatcher().jupyter()

# st.title("Feature Selection & Random Forest Classification")
# st.write("This is a Streamlit app running inside Jupyter Notebook.")

# # Display Streamlit components in Jupyter
# st.write()


In [117]:
# import streamlit_jupyter
# print(dir(streamlit_jupyter))
