In [2]:
# importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# read the WHO suicide statistics CSV into a DataFrame
# (the path is relative, so it is resolved against the kernel's working directory)
df = pd.read_csv("who_suicide_statistics.csv")


In [4]:
# preview the first five records to see what a row looks like
df.head(n=5)

Unnamed: 0,country,year,sex,age,suicides_no,population
0,Albania,1985,female,15-24 years,,277900.0
1,Albania,1985,female,25-34 years,,246800.0
2,Albania,1985,female,35-54 years,,267500.0
3,Albania,1985,female,5-14 years,,298300.0
4,Albania,1985,female,55-74 years,,138700.0


In [5]:
# list every column label on its own line, wrapped in single quotes

print("The name of the column is: ")
# build the whole listing first and emit it with a single print; end="" avoids
# adding an extra newline after the last label
print("".join(f"'{label}' \n" for label in df.columns), end="")



The name of the column is: 
'country' 
'year' 
'sex' 
'age' 
'suicides_no' 
'population' 


In [6]:
# checking the data type of each column

# print a header line, then let the notebook display the per-column dtypes
# as the cell's result
print("The data type of each column is: ")
df.dtypes

The data type of each column is: 


country         object
year             int64
sex             object
age             object
suicides_no    float64
population     float64
dtype: object

In [7]:
# report the size of the dataset: rows first, then columns

# len(df) is the row count
print(f"The number of rows in our dataset is =  '{len(df)}' \n")
# len(df.columns) is the column count
print(f"The number of columns in our dataset is = '{len(df.columns)}'")

The number of rows in our dataset is =  '43776' 

The number of columns in our dataset is = '6'


---
### **Observation:** 

- Our dataset has 6 columns: three are categorical and three are numeric.

- The `country`, `sex`, and `age` columns have the object data type, whereas `year` has an int data type. In contrast, both `suicides_no` and `population` have a float data type.

- The number of rows in our dataset is 43776
---

In [8]:
# count how many distinct countries appear in the dataset

df.country.nunique()

141

---

We have records for a total of 141 countries across the globe.

---

In [9]:
# list the distinct values of the year column, in order of first appearance

df.year.unique()

array([1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995,
       1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
       2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 1983, 1984,
       2016, 1979, 1980, 1981, 1982], dtype=int64)

In [10]:
# count how many distinct years are covered

df['year'].nunique()

38

---

We have data spanning 38 distinct years, from 1979 to 2016.


---

In [11]:
# number of records for each value of the sex column

df['sex'].value_counts()

sex
female    21888
male      21888
Name: count, dtype: int64

---

Out of the total `43776` records in our dataset, half (`21888`) are male and half (`21888`) are female.

---


In [19]:
# list the distinct age-group labels present in the data

df.age.unique()


array(['15-24 years', '25-34 years', '35-54 years', '5-14 years',
       '55-74 years', '75+ years'], dtype=object)

---

As we can see we have six age groups

|No. | Age Group |
| --- | --- |
| 1 | 5-14 years |
| 2 | 15-24 years |
| 3 | 25-34 years |
| 4 | 35-54 years |
| 5 | 55-74 years |
| 6 | 75+ years |
---

In [31]:
# number of records per age group, shown as a one-column DataFrame

df['age'].value_counts().to_frame()

Unnamed: 0_level_0,count
age,Unnamed: 1_level_1
15-24 years,7296
25-34 years,7296
35-54 years,7296
5-14 years,7296
55-74 years,7296
75+ years,7296


---

All the age groups in our dataset have an equal number of records: each age group contains 7296 records.

----



In [33]:
# count the missing values in each column (sum the null mask down the rows)

df.isnull().sum(axis=0)

country           0
year              0
sex               0
age               0
suicides_no    2256
population     5460
dtype: int64

---

As we can see, the `suicides_no` and `population` columns contain null values: `suicides_no` has `2256` NaN values and `population` has `5460`.

---

In [42]:
# render floats with two decimal places instead of scientific notation;
# this is a pandas-wide display option, so it affects every output shown after this cell

pd.set_option("display.float_format", "{:.2f}".format)

In [43]:
# descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns;
# the count row excludes missing values, so it differs between columns

df.describe()

Unnamed: 0,year,suicides_no,population
count,43776.0,41520.0,38316.0
mean,1998.5,193.32,1664091.14
std,10.34,800.59,3647231.23
min,1979.0,0.0,259.0
25%,1990.0,1.0,85112.75
50%,1999.0,14.0,380655.0
75%,2007.0,91.0,1305698.0
max,2016.0,22338.0,43805214.0


In [65]:

# Keep every numeric column. Selecting by the generic "number" kind (rather than
# listing 'int64' and 'float64' explicitly) means a column is not silently dropped
# if its dtype is later changed to another numeric width or a nullable integer type.
numeric_columns = df.select_dtypes(include="number")

# Pearson correlation matrix; pandas excludes NA/null values when computing it
correlation_matrix = numeric_columns.corr(method="pearson")

# Display the correlation matrix
correlation_matrix


Unnamed: 0,year,suicides_no,population
year,1.0,-0.01,0.03
suicides_no,-0.01,1.0,0.61
population,0.03,0.61,1.0


---

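- The minimum of `suicides_no` is 0, so some country/year/sex/age-group records report no suicides at all. Because the 25th percentile is already 1, at least three quarters of the non-null records report one or more suicides. Note that `suicides_no` is a count, not a rate.
- Secondly, the smallest `population` value is 259. This is the population of a single country/year/sex/age-group row, not of a whole country.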

