In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

# Read the merged workbook into the notebook-wide DataFrame `df`.
data_path = '/kaggle/input/correlation-data/data/merged_data.xlsx'
df = pd.read_excel(data_path)

# Structural overview: the leading rows first, then the dtype of each column.
print(df.head(), df.dtypes, sep='\n')

# Per-column count of missing cells.
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)

# Impute gaps in the numeric columns with that column's mean; non-numeric
# columns are left exactly as loaded. This assigns back into `df`, so every
# later cell sees the imputed values.
numerical_cols = df.select_dtypes(include='number').columns
df[numerical_cols] = df[numerical_cols].fillna(value=df[numerical_cols].mean())


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Give the GDP column an identifier-friendly name (mutates `df` in place).
df.rename(columns={'GDP Per Capita (US$)': 'GDP_per_capita'}, inplace=True)

# Overall correlation between the two series of interest.
gdp_series = df['GDP_per_capita']
crime_series = df['Criminality']
correlation = gdp_series.corr(crime_series)
print("Correlation coefficient between GDP per capita and the overall organized crime index:", correlation)

# Scatter of the same two series, one point per row of `df`.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(gdp_series, crime_series, alpha=0.5)
ax.set_title('Scatter Plot of GDP per Capita vs. Organized Crime Index')
ax.set_xlabel('GDP per Capita (US$)')
ax.set_ylabel('Organized Crime Index')
ax.grid(True)
plt.show()


In [None]:
import seaborn as sns

# Per-continent averages of GDP per capita and the Criminality score,
# with 'Continent' restored as an ordinary column.
regional_group = (
    df.groupby('Continent')[['GDP_per_capita', 'Criminality']]
    .mean()
    .reset_index()
)

# Summary statistics over every numeric column.
# Note: from here on `numerical_cols` holds a DataFrame of the numeric
# columns, not the column Index it was bound to in the loading cell.
numerical_cols = df.select_dtypes(include='number')
print("Descriptive Statistics for Numerical Data:")
print(numerical_cols.describe())

# Histogram of GDP per capita with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='GDP_per_capita', kde=True, color='blue', ax=ax)
ax.set(
    title='Distribution of GDP per Capita',
    xlabel='GDP per Capita (US Dollars)',
    ylabel='Frequency',
)
ax.grid(True)
plt.show()

# Horizontal box plot of the same column, to make outliers visible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='GDP_per_capita', ax=ax)
ax.set(
    title='Box Plot of GDP per Capita',
    xlabel='GDP per Capita (US Dollars)',
)
ax.grid(True)
plt.show()

# One bar chart per averaged metric in `regional_group`, continents on the
# x-axis. Each tuple is (column to plot, figure title, y-axis label).
for metric, heading, y_label in (
    ('GDP_per_capita', 'Average GDP per Capita by Continent', 'Average GDP per Capita (US$)'),
    ('Criminality', 'Average Organized Crime Index by Continent', 'Average Organized Crime Index'),
):
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Continent', y=metric, data=regional_group)
    plt.title(heading)
    plt.ylabel(y_label)
    plt.xlabel('Continent')
    plt.xticks(rotation=45)  # tilt the continent names so they do not overlap
    plt.show()

# Correlation between GDP per capita and Criminality within each continent.
# Missing continent labels are dropped first: comparing a column against NaN
# selects no rows, which would only print a meaningless "nan: nan" line.
continents = df['Continent'].dropna().unique()
for continent in continents:
    subset = df[df['Continent'] == continent]
    # Stored under its own name so the notebook-wide `correlation` (the
    # overall GDP-vs-Criminality coefficient computed in the earlier cell)
    # is not overwritten by the last continent's value.
    continent_correlation = subset['GDP_per_capita'].corr(subset['Criminality'])
    print(f"Correlation between GDP per capita and organized crime in {continent}: {continent_correlation:.2f}")

# Mean of three governance indicators per continent, shown as a heatmap.
governance_factors = [
    'Political leadership and governance',
    'Government transparency and accountability',
    'International cooperation',
]
continent_governance = (
    df.groupby('Continent')[governance_factors]
    .mean()
    .reset_index()
)

# Rows of the heatmap are continents, columns are the indicators; each cell
# is annotated with its mean to two decimals.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    continent_governance.set_index('Continent'),
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    ax=ax,
)
ax.set_title('Governance Factors by Continent')
plt.show()


In [None]:
# Import necessary libraries
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Select the resilience / governance indicator columns by keyword.
# The match must be case-insensitive: several indicator names begin with a
# capital letter ('Judicial system and detention', 'Law enforcement',
# 'Prevention'), so a case-sensitive search for the lowercase keywords below
# would silently leave them out of the analysis.
resilience_governance_pattern = (
    'leadership|transparency|cooperation|policies|judicial|law enforcement|'
    'territorial|integrity|money laundering|regulatory|support|prevention'
)
resilience_governance_columns = df.columns[
    # na=False keeps the mask strictly boolean even for a non-string label.
    df.columns.str.contains(resilience_governance_pattern, case=False, na=False)
]

# Filter the DataFrame to include only numeric columns for correlation calculation
numeric_df = df[resilience_governance_columns].select_dtypes(include=[np.number])

# Correlation matrix of the selected indicators together with the overall
# organized crime index ('Criminality' is appended as the last column).
correlation_matrix = numeric_df.join(df['Criminality']).corr()

# Visualize the correlation matrix using a heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation between Organized Crime Index and Resilience & Governance Factors')
plt.show()

# Rank the indicators by their correlation with 'Criminality', highest first;
# the index's trivial correlation with itself is dropped.
print("Correlation coefficients between organized crime index and resilience & governance factors:")
print(correlation_matrix['Criminality'].drop('Criminality').sort_values(ascending=False))


In [None]:
# Column names in the DataFrame as loaded (before 'GDP Per Capita (US$)' is
# renamed to 'GDP_per_capita'):
# Index(['Continent', 'Region', 'Country', 'Criminality', 'Criminal markets',
#        'Human trafficking', 'Human smuggling', 'Arms trafficking',
#        'Flora crimes', 'Fauna crimes', 'Non-renewable resource crimes',
#        'Heroin trade', 'Cocaine trade', 'Cannabis trade',
#        'Synthetic drug trade', 'Criminal actors', 'Mafia-style groups',
#        'Criminal networks', 'State-embedded actors', 'Foreign actors',
#        'Resilience', 'Political leadership and governance',
#        'Government transparency and accountability',
#        'International cooperation', 'National policies and laws',
#        'Judicial system and detention', 'Law enforcement',
#        'Territorial integrity', 'Anti-money laundering',
#        'Economic regulatory capacity', 'Victim and witness support',
#        'Prevention', 'Non-state actors', 'Year', 'Unit',
#        'GDP Per Capita (US$)', 'Code Value'],
#       dtype='object')

In [None]:
import pandas as pd
import plotly.express as px

# Create a choropleth map of the Criminality score.
# px.choropleth treats `locations` as ISO-3 codes unless `locationmode` says
# otherwise, so the 'Country' column is explicitly matched by country name.
# NOTE(review): assumes 'Country' holds country names rather than ISO codes — confirm.
fig1 = px.choropleth(
    df,
    locations='Country',
    locationmode='country names',
    color='Criminality',
    scope='world',
)

# Create a scatter plot with an OLS trendline
fig2 = px.scatter(df, x='GDP_per_capita', y='Criminality', trendline='ols')

# Create a line chart
fig3 = px.line(df, x='Year', y='Criminality')

# Create a bar chart
# NOTE(review): 'Criminal markets' looks like a score column rather than a
# category, here and in the pie chart below — confirm it is the intended axis.
fig4 = px.bar(df, x='Criminal markets', y='Criminality')

# Create a heatmap of pairwise correlations across the numeric columns.
# 'number' selects every numeric dtype, matching the np.number selection used
# in the earlier cells instead of hard-coding float64/int64.
numeric_df = df.select_dtypes(include='number')
fig5 = px.imshow(numeric_df.corr())

# Create a pie chart
fig6 = px.pie(df, values='Criminality', names='Criminal markets')

# Create a histogram
fig7 = px.histogram(df['Criminality'])

# Create a box plot of Criminality per Region
fig8 = px.box(df, y='Criminality', x='Region')

# Show the graphs in creation order
figs = [fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8]
for fig in figs:
    fig.show()
