In [177]:
import pandas as pd  
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score 

In [178]:
data = pd.read_csv('C:/Users/kareem/Downloads/Customer Churn.csv')
data.columns


Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [179]:
def fitness_function(data):
    missing_values = data.isnull().sum().sum()  
    duplicates = data.duplicated().sum()
    return missing_values + duplicates 

In [180]:
def particle_swarm_optimization(data, max_iterations=10, n_particles=5):
    
    particles = [data.copy() for _ in range(n_particles)]  
    
    p_best = particles.copy()  
    
    p_best_fitness = [fitness_function(p) for p in particles]  
    
    g_best = p_best[np.argmin(p_best_fitness)]  
    for _ in range(max_iterations):  
        for i, particle in enumerate(particles):
            
            if np.random.rand() > 0.5:
                particle = particle.drop(particle.sample(1).index)

            for col in particle.select_dtypes(include='number').columns:  
                particle[col] = particle[col].fillna(particle[col].mean())  
            
            for col in particle.select_dtypes(exclude='number').columns:  
                particle[col] = particle[col].fillna(particle[col].mode()[0])  

            fitness = fitness_function(particle)

            if fitness < p_best_fitness[i]:
                p_best[i] = particle
                p_best_fitness[i] = fitness

            if fitness < fitness_function(g_best):
                g_best = particle

    return g_best

In [181]:
cleaned_data = particle_swarm_optimization(data) #10 iteration

cleaned_data = pd.get_dummies(cleaned_data, drop_first=True) 

X = cleaned_data.drop('Exited', axis=1, errors='ignore')
  
if 'Exited' in cleaned_data.columns:
    y = cleaned_data['Exited']  
else:
    print("Error: Target column 'Churn_Yes' not found.")  
    exit()

In [215]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)  
model.fit(X_train, y_train)  

y_pred = model.predict(X_test) 

In [221]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("accurecy : " , accuracy)
print("precision : " , precision)
print("recall : " ,  recall)
print("F1 Score : " , f1)
print((2*(precision*recall)/(precision+recall)))

accurecy :  0.797752808988764
precision :  0.8448275862068966
recall :  0.6447368421052632
F1 Score :  0.7313432835820896
0.7313432835820897


In [184]:
titanic1 =pd.read_csv("C:/Users/kareem/Downloads/titanic-data.csv")
fitness_function(titanic1) #data is not clean 

866

In [185]:
titanic1.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [186]:
titanic1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [187]:
titanic1.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [188]:
# PassengerId - this is a just a generated Id
# Pclass - which class did the passenger ride - first, second or third
# Name - self explanatory
# Sex - male or female
# Age
# SibSp - were the passenger's spouse or siblings with them on the ship
# Parch - were the passenger's parents or children with them on the ship
# Ticket - ticket number
# Fare - ticker price
# Cabin
# Embarked - port of embarkation
# Survived - did the passenger survive the sinking of the Titanic?

In [189]:
titanic1['Survived'].value_counts()
#0 is dead 
#1 is also dead but survived 

Survived
0    549
1    342
Name: count, dtype: int64

In [190]:
titanic1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [191]:
titanic1['Cabin']
#there are null values 

0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object

In [192]:
titanic1['Sex'] = titanic1['Sex'].apply(lambda x: 1 if x == 'male' else 0)

In [193]:
titanic1['Sex']
#replaced male : 1 and female : 0

0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: Sex, Length: 891, dtype: int64

In [194]:
titanic1.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [195]:
#start with age 
titanic1['Age']=titanic1['Age'].fillna(titanic1['Age'].mean())
titanic1.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [196]:
#capin replace
titanic1['Cabin']=titanic1['Cabin'].fillna(titanic1['Cabin'].mode()[0])
titanic1.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       2
dtype: int64

In [197]:
titanic1['Embarked']=titanic1['Embarked'].fillna(titanic1['Embarked'].mode()[0])
titanic1.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [198]:
#data is cleaned now , then we search for outlier 
q1=titanic1['Fare'].quantile(0.25)
q3=titanic1['Fare'].quantile(0.75)
iqr =q3-q1


In [199]:
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Identify outliers
outliers = titanic1[(titanic1['Fare'] < lower_bound) | (titanic1['Fare'] > upper_bound)]
print("Outliers in Fare:")
print(outliers)

Outliers in Fare:
     PassengerId  Survived  Pclass  \
1              2         1       1   
27            28         0       1   
31            32         1       1   
34            35         0       1   
52            53         1       1   
..           ...       ...     ...   
846          847         0       3   
849          850         1       1   
856          857         1       1   
863          864         0       3   
879          880         1       1   

                                                  Name  Sex        Age  SibSp  \
1    Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.000000      1   
27                      Fortune, Mr. Charles Alexander    1  19.000000      3   
31      Spencer, Mrs. William Augustus (Marie Eugenie)    0  29.699118      1   
34                             Meyer, Mr. Edgar Joseph    1  28.000000      1   
52            Harper, Mrs. Henry Sleeper (Myna Haxtun)    0  49.000000      1   
..                                     

In [200]:
titanic1 = titanic1[(titanic1['Fare'] > lower_bound) & (titanic1['Fare'] < upper_bound)]

In [201]:
#data is cleaned now , then we search for outlier 
q1=titanic1['Age'].quantile(0.25)
q3=titanic1['Age'].quantile(0.75)
iqr =q3-q1


In [202]:
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Identify outliers
outliers = titanic1[(titanic1['Age'] < lower_bound) | (titanic1['Age'] > upper_bound)]
print("Outliers in Age:")
print(outliers)

Outliers in Age:
     PassengerId  Survived  Pclass                              Name  Sex  \
6              7         0       1           McCarthy, Mr. Timothy J    1   
7              8         0       3    Palsson, Master. Gosta Leonard    1   
11            12         1       1          Bonnell, Miss. Elizabeth    0   
15            16         1       2  Hewlett, Mrs. (Mary D Kingcome)     0   
16            17         0       3              Rice, Master. Eugene    1   
..           ...       ...     ...                               ...  ...   
803          804         1       3   Thomas, Master. Assad Alexander    1   
824          825         0       3      Panula, Master. Urho Abraham    1   
827          828         1       2             Mallet, Master. Andre    1   
831          832         1       2   Richards, Master. George Sibley    1   
851          852         0       3               Svensson, Mr. Johan    1   

       Age  SibSp  Parch           Ticket     Fare    Cabi

In [203]:
titanic1 = titanic1[(titanic1['Age'] > lower_bound) & (titanic1['Age'] < upper_bound)]

In [204]:
cleaned_titanic1 = particle_swarm_optimization(titanic1)

cleaned_titanic1 = pd.get_dummies(cleaned_titanic1, drop_first=True)

X = cleaned_titanic1.drop('Survived', axis=1, errors='ignore')  
if 'Survived' in cleaned_titanic1.columns:
    y = cleaned_titanic1['Survived']  
else:
    print("Error: Target column 'Churn_Yes' not found.") 
    exit()

In [205]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)  
model.fit(X_train, y_train)  

y_pred = model.predict(X_test)  

In [206]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print("accurecy : " , accuracy)
print("precision : " , precision)
print("recall : " ,  recall)
print("F1 Score : " , f1)

accurecy :  0.8428571428571429
precision :  0.9333333333333333
recall :  0.5833333333333334
F1 Score :  0.717948717948718


In [207]:
titanic2 = pd.read_csv("C:/Users/kareem/Downloads/titanic-data.csv")

In [208]:
cleaned_titanic2 = particle_swarm_optimization(titanic2)

cleaned_titanic2= pd.get_dummies(cleaned_titanic2, drop_first=True)

X = cleaned_titanic2.drop('Survived', axis=1, errors='ignore')  
if 'Survived' in cleaned_titanic2.columns:
    y = cleaned_titanic2['Survived']  
else:
    print("Error: Target column 'Churn_Yes' not found.") 
    exit()

In [209]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)  
model.fit(X_train, y_train)  

y_pred = model.predict(X_test) 

In [222]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred )

print("accurecy : " , accuracy)
print("precision : " , precision)
print("recall : " ,  recall)
print("F1 Score : " , f1)

accurecy :  0.797752808988764
precision :  0.8448275862068966
recall :  0.6447368421052632
F1 Score :  0.7313432835820896
