# Dealing with Duplicate Values

## Detect duplicate rows: identify rows with identical values across all columns.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Load the Titanic passenger CSV into `df`, the working frame used by the cells below.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is;
# consider a configurable data directory.
df = pd.read_csv('/Users/sandipshrestha/My Files/BIT-2020/python/DataSet/titanic.csv')

In [5]:
# Identify duplicate rows: boolean mask, True where a row repeats an earlier row
# across all columns (with pandas' default, the first occurrence stays False).
duplicates = df.duplicated()

In [6]:
# Show the rows flagged by the mask; the recorded output is an empty frame,
# i.e. no row is an exact copy of another.
print(df[duplicates])

Empty DataFrame
Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]
Index: []


In [7]:
# Print the working frame as plain text.
# NOTE(review): this dumps the whole frame; `df.head()` would give a shorter, richly rendered preview.
print(df)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [10]:
# Key columns passed as `subset` to the column-restricted duplicate checks.
specific_cols = [
    'PassengerId',
    'Survived',
    'Name',
    'Ticket',
]

In [11]:
# Re-run the duplicate check, comparing rows on `specific_cols` only.
# NOTE: this rebinds `duplicates`, replacing the full-row mask computed earlier.
duplicates = df.duplicated(subset=specific_cols)

In [12]:
# Print the boolean mask (one entry per row of `df`).
print(duplicates)

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Length: 891, dtype: bool


## Count duplicate values and profile each column

In [13]:
# Summing the boolean mask counts its True entries, i.e. the flagged rows.
# NOTE(review): going by the execution counts, `duplicates` here is the
# `specific_cols` subset mask, not the full-row one — confirm the label matches the intent.
num_duplicates = duplicates.sum()
print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 0


In [19]:
# Count rows whose Ticket value already appeared on an earlier row.
df['Ticket'].duplicated().sum()

210

In [14]:
# Profile every column of `df`: print its cardinality, then its most common values.
for col in df.columns:
    print(f"\n**Unique values in '{col}' column:**")
    print(df[col].nunique())  # distinct values (NaN is excluded by pandas' default)
    # Closing '**' added so this header matches the "Unique values" header above;
    # the original string left the bold markup unbalanced.
    print(f"\n**Top 5 most frequent values in '{col}' column:**")
    print(df[col].value_counts().head(5))  # up to five values, most frequent first


**Unique values in 'PassengerId' column:**
891

**Top 5 most frequent values in 'PassengerId' column:
PassengerId
1      1
599    1
588    1
589    1
590    1
Name: count, dtype: int64

**Unique values in 'Survived' column:**
2

**Top 5 most frequent values in 'Survived' column:
Survived
0    549
1    342
Name: count, dtype: int64

**Unique values in 'Pclass' column:**
3

**Top 5 most frequent values in 'Pclass' column:
Pclass
3    491
1    216
2    184
Name: count, dtype: int64

**Unique values in 'Name' column:**
891

**Top 5 most frequent values in 'Name' column:
Name
Braund, Mr. Owen Harris             1
Boulos, Mr. Hanna                   1
Frolicher-Stehli, Mr. Maxmillian    1
Gilinski, Mr. Eliezer               1
Murdlin, Mr. Joseph                 1
Name: count, dtype: int64

**Unique values in 'Sex' column:**
2

**Top 5 most frequent values in 'Sex' column:
Sex
male      577
female    314
Name: count, dtype: int64

**Unique values in 'Age' column:**
88

**Top 5 most frequent 

In [8]:
# Drop rows that are exact copies of an earlier row (all columns compared).
# The result is bound to a new name; `df` itself is not reassigned here.
df_cleaned = df.drop_duplicates()

In [9]:
# Display the frame produced by the full-row deduplication.
df_cleaned

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Decide on the treatment strategy: Depending on the nature of your data and the analysis you're conducting, you can choose to either remove duplicates entirely or keep one instance of each duplicate.

In [16]:
# Remove duplicates based on specific columns and keep the first occurrence.
# Rows are compared on `specific_cols` only; later repeats of a key combination are dropped.
df_deduplicated = df.drop_duplicates(subset=specific_cols, keep='first')


In [17]:
# Display the subset-deduplicated frame.
# NOTE(review): the recorded output includes a `duplicate_flag` column that no visible cell
# creates — it presumably came from a cell that was later removed; verify with Restart & Run All.
df_deduplicated

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,duplicate_flag
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,False
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,False
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,False
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,False


## Implement the chosen strategy: remove duplicates based on specific criteria, such as ticket number or passenger name.


In [22]:
# Preview two rows whose Ticket repeats one seen on an earlier row.
df[df['Ticket'].duplicated()].head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,duplicate_flag
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.075,,S,False
71,72,0,3,"Goodwin, Miss. Lillian Amy",female,16.0,5,2,CA 2144,46.9,,S,False


In [23]:
# Total number of Ticket entries (one per row of `df`).
df['Ticket'].shape

(891,)

In [24]:
# Number of Ticket entries left once repeated values are dropped.
df['Ticket'].drop_duplicates().shape

(681,)

In [25]:
# Shape after dropping rows that share both Name and Ticket with an earlier row;
# the recorded row count is still 891, so no such repeated pairs were found.
df.drop_duplicates(subset = ['Name','Ticket']).shape

(891, 13)

## Validate the effectiveness of duplicate removal: assess the impact on data integrity.

In [26]:
# Reload the CSV into a separate frame, presumably as a baseline for the validation section.
# NOTE(review): same hardcoded absolute path as the initial load — keep the two in sync
# (ideally via a single shared path constant).
df_org = pd.read_csv('/Users/sandipshrestha/My Files/BIT-2020/python/DataSet/titanic.csv')

In [27]:
# Preview the first five rows of the reloaded data.
df_org.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
# Print the reloaded frame as plain text.
# NOTE(review): this dumps the whole frame; the `.head()` preview in the previous cell already shows its layout.
print(df_org)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [29]:
# Ticket values with repeats removed (the first occurrence of each is kept by default).
# NOTE(review): despite the name, `df_dup` is a single-column Series of unique tickets,
# not a DataFrame and not the duplicated rows.
df_dup = df['Ticket'].drop_duplicates()

In [33]:
# Preview the first few unique ticket values.
df_dup.head()

0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object