In [47]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt


In [48]:
# Load the raw sugarcane production table from the CSV next to the notebook.
csv_path = "List of Countries by Sugarcane Production.csv"
df = pd.read_csv(csv_path)

In [None]:
# (rows, columns) of the freshly loaded table.
df.shape

In [None]:
# Preview the first rows of the raw data.
df.head()

Data Cleaning

In [51]:
# The numeric columns are read as text. This cleanup treats "." as a thousands
# separator (removed) and, for the two fractional columns, "," as the decimal
# mark (turned into ".").
#
# regex=False is passed explicitly: the default of `regex` changed from True to
# False in pandas 2.0, and pandas 1.2-1.5 emit a FutureWarning for
# single-character patterns such as ".". Stating the literal intent keeps the
# behaviour identical on every version.
#
# NOTE: this cell is not idempotent - running it twice on the same frame strips
# the decimal point it just inserted. Re-load the CSV before re-running.
for column in ["Production (Tons)", "Acreage (Hectare)"]:
    df[column] = df[column].str.replace(".", "", regex=False)

for column in ["Production per Person (Kg)", "Yield (Kg / Hectare)"]:
    df[column] = (
        df[column]
        .str.replace(".", "", regex=False)
        .str.replace(",", ".", regex=False)
    )


In [None]:
# Re-check the first rows after the separator cleanup.
df.head()

In [53]:
# Discard the "Unnamed: 0" column (presumably a saved row index - verify against the CSV).
df = df.drop(columns=["Unnamed: 0"])

In [54]:
# Replace the verbose, unit-bearing headers with short identifiers in one pass.
short_names = {
    "Production (Tons)": "Production",
    "Production per Person (Kg)": "Production_per_person",
    "Acreage (Hectare)": "Acreage",
    "Yield (Kg / Hectare)": "Yield",
}
df.rename(columns=short_names, inplace=True)

In [None]:
# Confirm the shortened column names.
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Show the rows that have no acreage value.
missing_acreage = df["Acreage"].isna()
df.loc[missing_acreage]

In [58]:
# Remove every row that has a missing value and renumber the rows from 0,
# discarding the old index instead of keeping it as a column.
df = df.dropna().reset_index(drop=True)


In [None]:
# Display the frame after the incomplete rows were dropped.
df

In [None]:
# Count of distinct values per column.
df.nunique()

In [None]:
# Column dtypes before the numeric conversion below.
df.dtypes

In [62]:
# Cast each measurement column to float so it can be aggregated and plotted.
for numeric_column in ("Production", "Production_per_person", "Acreage", "Yield"):
    df[numeric_column] = df[numeric_column].astype(float)

In [None]:
# Verify the four measurement columns are now float.
df.dtypes

Univariate Analysis

In [None]:
# Reminder of the cleaned columns before the univariate plots.
df.head()

Continent producing sugarcane

In [None]:
# Number of rows (countries) per continent.
df["Continent"].value_counts()

In [None]:
# Bar chart of how many countries each continent contributes.
continent_counts = df["Continent"].value_counts()
continent_counts.plot.bar()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

Checking Outliers

In [None]:
# One box plot per measurement column on a 2x2 grid, to spot outliers.
#
# The series is passed as x= explicitly: seaborn 0.11 reads a bare positional
# series as `x` (with a FutureWarning), while seaborn 0.12+ reads the only
# positional argument as `data`. The keyword makes the intent unambiguous on
# both. Explicit axes replace the plt.subplot state machine.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for ax, column in zip(axes.flat, ["Production", "Production_per_person", "Acreage", "Yield"]):
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(column)
fig.tight_layout()  # keep titles and axis labels of neighbouring panels from overlapping
plt.show()

Distribution of Columns

In [None]:
# Histogram with a KDE overlay for each measurement column on a 2x2 grid.
#
# sns.distplot has been deprecated since seaborn 0.11; histplot with kde=True
# and stat="density" is its documented replacement. kde_kws cut=3 follows the
# seaborn migration guidance for matching distplot's KDE tails.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for ax, column in zip(axes.flat, ["Production", "Production_per_person", "Acreage", "Yield"]):
    sns.histplot(x=df[column], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
    ax.set_title(column)
fig.tight_layout()  # keep titles and axis labels of neighbouring panels from overlapping
plt.show()

In [None]:
# Violin plot of total production.
# x= is given explicitly: a bare positional series is read as `x` on seaborn
# 0.11 (with a FutureWarning) and as `data` on seaborn 0.12+.
sns.violinplot(x=df["Production"])

In [None]:
# Look at the columns again before building the per-country share table.
df.head()

In [72]:
# Production per country, indexed by country name.
country_production = df[["Country", "Production"]]
df_new = country_production.set_index("Country")

In [None]:
# Inspect the country-indexed production table.
df_new

In [74]:
# Each country's share of the summed production, in percent.
total_production = df_new["Production"].sum()
df_new["Production_percent"] = df_new["Production"] * 100 / total_production

In [None]:
# Inspect the table with the added Production_percent column.
df_new

In [None]:
# Pie chart of every country's production share, labelled with two decimals.
production_share = df_new["Production_percent"]
production_share.plot.pie(autopct="%.2f")

In [None]:
# Bar chart of the 15 countries with the largest production.
top_producers = (
    df[["Country", "Production"]]
    .set_index("Country")
    .sort_values("Production", ascending=False)
    .head(15)
)
top_producers.plot(kind="bar")

In [None]:
# Top 15 countries by production.
# Rank explicitly rather than taking the first 15 rows: df.head(15) is only the
# top 15 if the CSV happens to be ordered by production. The acreage and yield
# charts in this notebook sort explicitly in the same way.
df_prod = df.sort_values("Production", ascending=False).head(15)
ax = sns.barplot(data=df_prod, x="Country", y="Production")
# Rotate the country names so they do not overlap; tick_params leaves the tick
# positions and label texts untouched.
ax.tick_params(axis="x", labelrotation=90)
plt.show()

Countries with the largest sugarcane acreage

In [None]:
# Top 15 countries by cultivated acreage.
df_acr = df.sort_values("Acreage", ascending=False).head(15)
ax = sns.barplot(data=df_acr, x="Country", y="Acreage")
# Rotate the country names with tick_params instead of set_xticklabels: calling
# set_xticklabels without first fixing the tick positions makes matplotlib warn
# and can misalign labels.
ax.tick_params(axis="x", labelrotation=90)
plt.show()

Highest yield per Hectare

In [None]:
# Top 15 countries by yield.
df_yield = df.sort_values("Yield", ascending=False).head(15)
ax = sns.barplot(data=df_yield, x="Country", y="Yield")
# Rotate the country names with tick_params instead of set_xticklabels: calling
# set_xticklabels without first fixing the tick positions makes matplotlib warn
# and can misalign labels.
ax.tick_params(axis="x", labelrotation=90)
plt.show()

Highest production per person

In [None]:
# Top 15 countries by production per person.
# The ranking gets its own name; the original stored it in `df_yield`, which
# overwrote the yield ranking with unrelated data.
df_per_person = df.sort_values("Production_per_person", ascending=False).head(15)
ax = sns.barplot(data=df_per_person, x="Country", y="Production_per_person")
# Rotate the country names with tick_params instead of set_xticklabels: calling
# set_xticklabels without first fixing the tick positions makes matplotlib warn
# and can misalign labels.
ax.tick_params(axis="x", labelrotation=90)
plt.show()

Do countries with more acreage produce more sugarcane?

In [None]:
# Acreage against total production, one point per row, coloured by continent.
sns.scatterplot(
    x="Acreage",
    y="Production",
    hue="Continent",
    data=df,
)

Do countries with a higher yield per hectare produce more sugarcane?

In [None]:
# Yield against total production, one point per row, coloured by continent.
sns.scatterplot(
    x="Yield",
    y="Production",
    hue="Continent",
    data=df,
)

In [None]:
# Reminder of the per-country columns before aggregating by continent.
df.head()

Analysis for continent

In [85]:
# Sum the numeric columns per continent.
# numeric_only=True keeps the text "Country" column out of the aggregation; on
# pandas 2.x the default (numeric_only=False) would concatenate the country
# names into one long string per continent.
# NOTE(review): summing "Yield" and "Production_per_person" across countries is
# kept from the original, but these sums are probably not meaningful - confirm
# before using those two columns of df_continent.
df_continent = df.groupby("Continent").sum(numeric_only=True)


In [86]:
# Count the non-null country entries per continent and attach them as a new
# column (aligned on the continent index).
countries_per_continent = df.groupby("Continent")["Country"].count()
df_continent["number_of_countries"] = countries_per_continent

In [None]:
# Inspect the per-continent aggregate table.
df_continent

Which continent produces the most sugarcane?

In [None]:
# Continents ordered from largest to smallest total production.
continent_production = df_continent["Production"].sort_values(ascending=False)
continent_production.plot.bar()

Does the number of countries in a continent affect sugarcane production?

In [None]:
# Production against the number of producing countries, with each x tick
# relabelled by the continent that has that country count.
continent_names = list(df_continent.index)
sns.lineplot(x="number_of_countries", y="Production", data=df_continent)
plt.xticks(
    ticks=df_continent["number_of_countries"],
    labels=continent_names,
    rotation=90,
)
plt.show()

Do continents with more acreage produce more sugarcane?

In [None]:
# Total acreage against total production, one point per continent.
sns.lineplot(
    x="Acreage",
    y="Production",
    data=df_continent,
)

Production distribution by continent 

In [None]:
# Share of total production contributed by each continent, as a percentage pie.
continent_totals = df_continent["Production"]
continent_totals.plot.pie(autopct="%.2f%%")
plt.title('Production Distribution by Continent')
plt.show()