<div style="background-color: #333; padding: 40px; border: 2px solid #ffd700; border-radius: 10px; color: #ffd700; text-align: center; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);">

<h1 style="font-size: 48px; font-weight: bold; color: #ffd700;">Japanese Universities</h1>

<img src="https://japaninsider.com/wp-content/uploads/2023/04/tohoku-university.jpeg" alt="Tohoku University" style="width: 500px; margin: 20px auto; border-radius: 10px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);">
    
</div>

<div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 180%; text-align: center; color: #ffd700; font-weight: bold;"> Table of Contents 
</div>

<ul class="list-group" id="list-tab" role="tablist">
    <li><a href="#1.-Import-Libraries">1. Import Libraries</a></li><br>
    <li><a href="#2.-Load-data">2. Load data</a></li><br>
    <li><a href="#3.-Exploratory-Data-Analysis">3. Exploratory Data Analysis</a></li><br>
    <li><a href="#3.1-Data-quality">3.1 Data quality</a></li><br>
    <li><a href="#3.2-Geographical-Analysis">3.2 Geographical Analysis</a></li><br>
    <li><a href="#3.3-Institutional-Characteristics">3.3 Institutional Characteristics</a></li><br>
    <li><a href="#3.4-Temporal-Analysis">3.4 Temporal Analysis </a></li><br>
</ul>

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">1. Import Libraries</div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">2. Load data</div>

In [None]:
# Kaggle input location of the dataset; the first CSV column becomes the row index.
DATA_PATH = "/kaggle/input/japanese-universities/japanese_universities.csv"
df = pd.read_csv(DATA_PATH, index_col=0)
# Bare last expression so the notebook renders the loaded frame.
df

In [None]:
# Print the frame's per-column summary (dtypes and non-null counts).
df.info()

In [None]:
# Descriptive statistics of the frame, rendered as the cell output.
df.describe()

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">3. Exploratory Data Analysis</div>

## <div style="border-radius: 10px; border: 2px solid #333; padding: 15px; background-color: #ffd700; font-size: 120%; text-align: left; color: #333; font-weight: bold;">3.1 Data quality</div>

### I | Check duplicates

In [None]:
# Flag every row that repeats an earlier row, then count the flagged rows.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(duplicates)

### II | Check null and missing values

In [None]:
# Missing values per column, and their share of all cells in the frame.
missing_values = df.isnull().sum()
total_missing_values = missing_values.sum()
# `df.size` is rows * columns. It replaces `np.product`, a deprecated alias
# that was removed in NumPy 2.0 and would raise AttributeError there.
total_cells = df.size
percent_missing_values = (total_missing_values / total_cells) * 100
print("Percent of data that is missing", percent_missing_values)
print(missing_values)

### III | Check unique values in each column

In [None]:
# Distinct-value count per column; dropna=False keeps a missing value counted
# as its own distinct entry, matching len(Series.unique()).
for column, num_distinct_values in df.nunique(dropna=False).items():
    print(f"{column}: {num_distinct_values} distinct values")

### IV | Correlation Analysis

In [None]:
# Restrict to numeric columns before computing pairwise correlations.
numeric_columns = df.select_dtypes(include="number")
correlation_matrix = numeric_columns.corr()
# Bare last expression so the notebook renders the matrix.
correlation_matrix

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicit 15x10 inch axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation_matrix, vmax=.8, square=True, annot=True, cmap='YlGn', ax=ax)
# Trailing semicolon suppresses the Text repr in the cell output.
ax.set_title('Correlation Matrix', fontsize=15);

## <div style="border-radius: 10px; border: 2px solid #333; padding: 15px; background-color: #ffd700; font-size: 120%; text-align: left; color: #333; font-weight: bold;">3.2 Geographical Analysis</div>

In [None]:
# Impute missing review fields with each column's median.
# Assign the filled Series back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form emits a
# FutureWarning in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['review_rating'] = df['review_rating'].fillna(df['review_rating'].median())
df['review_count'] = df['review_count'].fillna(df['review_count'].median())

In [None]:
# One marker per university: colour encodes difficulty_rank, marker size encodes review_count.
# NOTE(review): recent Plotly releases deprecate the *_mapbox functions in
# favour of px.scatter_map -- confirm against the installed Plotly version.
map_options = dict(
    lat="latitude",
    lon="longitude",
    color="difficulty_rank",
    size="review_count",
    hover_data=["name", "faculty_count", "department_count"],
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=15,
    zoom=5,
    mapbox_style="carto-positron",
    title="Japanese Universities: Difficulty Rank, Reviews, and Departments",
)
fig_map = px.scatter_mapbox(df, **map_options)

fig_map.show()

In [None]:
# Number of universities per state, most frequent first.
state_counts = df['state'].value_counts()

# Draw on an explicit axes rather than the implicit pyplot current figure.
state_fig, state_ax = plt.subplots(figsize=(10, 6))
state_counts.plot.bar(color='skyblue', ax=state_ax)
state_ax.set_title('University Distribution Across States')
state_ax.set_xlabel('State')
state_ax.set_ylabel('Number of Universities')
plt.show()

In [None]:
# Bars per state, coloured by university type, with the type categories in a fixed order.
type_order = ['National', 'Public', 'Private']
fig1 = px.bar(
    df,
    x='state',
    color='type',
    title='University Types Across States',
    labels={'state': 'State', 'type': 'University Type'},
    category_orders={'type': type_order},
)

display(fig1)

## <div style="border-radius: 10px; border: 2px solid #333; padding: 15px; background-color: #ffd700; font-size: 120%; text-align: left; color: #333; font-weight: bold;">3.3 Institutional Characteristics</div>

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))

# (column, panel title) pairs, placed row by row on the 2x2 grid.
histogram_panels = [
    ("department_count", "Distribution of department_count"),
    ("faculty_count", "Distribution of Faculty count"),
    ("review_count", "Distribution of review count"),
    ("review_rating", "Distribution of review rating"),
]

# axes.flat walks the grid in row-major order, matching the list above.
for panel_ax, (column_name, panel_title) in zip(axes.flat, histogram_panels):
    panel_ax.set_title(panel_title)
    sns.histplot(data=df, x=column_name, ax=panel_ax, kde=True)

plt.tight_layout()
plt.show()

In [None]:
# Grouped bars of faculty and department counts for each university type.
count_columns = ['faculty_count', 'department_count']
fig = px.bar(
    df,
    x='type',
    y=count_columns,
    barmode='group',
    labels={'value': 'Count', 'variable': 'Category'},
    title='Distribution of Faculties and Departments Among University Types',
)
fig.show()

In [None]:
# Side-by-side boxplots of faculty and department counts per university type.
box_fig, box_axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# (column, panel title, y-axis label) for the left and right panels.
boxplot_panels = [
    ('faculty_count', 'Diversity of Universities in Terms of Faculties', 'Number of Faculties'),
    ('department_count', 'Diversity of Universities in Terms of Departments', 'Number of Departments'),
]

for box_ax, (count_column, panel_title, y_label) in zip(box_axes, boxplot_panels):
    # NOTE(review): newer seaborn releases warn when `palette` is passed
    # without `hue` -- confirm against the installed seaborn version.
    sns.boxplot(data=df, x='type', y=count_column, palette='viridis', ax=box_ax)
    box_ax.set_title(panel_title)
    box_ax.set_xlabel('University Type')
    box_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

## <div style="border-radius: 10px; border: 2px solid #333; padding: 15px; background-color: #ffd700; font-size: 120%; text-align: left; color: #333; font-weight: bold;">3.4 Temporal Analysis</div>

In [None]:
# NOTE(review): if `found` holds plain numeric years, pd.to_datetime reads them
# as offsets from the Unix epoch rather than as years -- confirm the column's dtype.
df['found'] = pd.to_datetime(df['found'])

# Floor each founding year to the first year of its decade (e.g. 1987 -> 1980).
founding_year = df['found'].dt.year
df['decade'] = founding_year - founding_year % 10

# One row per decade with the number of universities founded in it.
decade_sizes = df.groupby('decade').size()
df_decade = decade_sizes.reset_index(name='count')

fig = px.line(
    df_decade,
    x='decade',
    y='count',
    markers=True,
    title='University Growth Over Decades',
    labels={'decade': 'Decade', 'count': 'Number of Universities'},
)
fig.show()

In [None]:
# Histogram of founding dates, coloured by university type.
# The labels mapping is keyed by column name, so the x-axis entry must be
# 'found' (the plotted column); the previous 'founding_year' key matched no
# column and the "Year" label was never applied.
fig_university_types = px.histogram(df, x='found', color='type',
                                     labels={'found': 'Year', 'count': 'Number of Universities'},
                                     title='Changes in University Types Over Time')

fig_university_types.show()