# Importing Libraries

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading data

In [23]:
# Load the wine-quality measurements from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='winequality-red.csv')
# A bare trailing expression lets Jupyter render the frame as a table.
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# Check for missing values

In [24]:
# Count missing entries per column (isna is the canonical name for isnull).
df.isna().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

# Categorise alcohol levels

In [25]:
# Bin the numeric alcohol column into seven labelled, one-unit-wide bands:
#   8-9 low, 9-10 medium low, 10-11 medium, 11-12 medium high,
#   12-13 high, 13-14 very high, 14-15 extra high.
# Intervals are closed on the right, so a reading of exactly 9 is 'low'.
# include_lowest=True also closes the first band on the left; without it a
# reading of exactly 8 would silently turn into NaN. Readings outside 8-15
# still become NaN.
# The numeric values are replaced by the labels, so the original numbers are
# no longer available in df after this cell; re-run the load cell to get them back.
alcohol_edges = [8, 9, 10, 11, 12, 13, 14, 15]
alcohol_labels = ['low', 'medium low', 'medium', 'medium high', 'high', 'very high', 'extra high']
df['alcohol'] = pd.cut(
    df['alcohol'],
    bins=alcohol_edges,
    labels=alcohol_labels,
    include_lowest=True,
)
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,medium low,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,medium low,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,medium low,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,medium low,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,medium low,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,medium,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,medium high,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,medium,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,medium,5


# Categorise citric acid

In [26]:
# Bin citric acid into four labelled bands:
#   0-0.25 low, 0.25-0.5 medium, 0.5-0.75 high, 0.75-1 very high.
# Intervals are closed on the right; the first edge sits at -0.1 rather than 0
# so that a reading of exactly 0 still lands in 'low'.
citric_edges = [-0.1, 0.25, 0.5, 0.75, 1]
citric_labels = ['low', 'medium', 'high', 'very high']
df['citric_acid'] = pd.cut(df['citric_acid'], bins=citric_edges, labels=citric_labels)
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,low,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,medium low,5
1,7.8,0.880,low,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,medium low,5
2,7.8,0.760,low,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,medium low,5
3,11.2,0.280,high,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,medium low,6
4,7.4,0.700,low,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,medium low,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,low,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,medium,5
1595,5.9,0.550,low,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,medium high,6
1596,6.3,0.510,low,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,medium,6
1597,5.9,0.645,low,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,medium,5


# Categorise volatile acidity

In [27]:
# Bin volatile acidity into three labelled bands:
#   0-0.4 low, 0.4-0.8 medium, 0.8-1.6 high.
# Intervals are closed on the right, so a reading of exactly 0.4 is 'low'.
# include_lowest=True also closes the first band on the left; without it a
# reading of exactly 0 would silently turn into NaN. Readings above 1.6 still
# become NaN.
# The numeric values are replaced by the labels, so the original numbers are
# no longer available in df after this cell; re-run the load cell to get them back.
volatile_edges = [0, 0.4, 0.8, 1.6]
volatile_labels = ['low', 'medium', 'high']
df['volatile_acidity'] = pd.cut(
    df['volatile_acidity'],
    bins=volatile_edges,
    labels=volatile_labels,
    include_lowest=True,
)
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,medium,low,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,medium low,5
1,7.8,high,low,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,medium low,5
2,7.8,medium,low,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,medium low,5
3,11.2,low,high,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,medium low,6
4,7.4,medium,low,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,medium low,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,medium,low,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,medium,5
1595,5.9,medium,low,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,medium high,6
1596,6.3,medium,low,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,medium,6
1597,5.9,medium,low,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,medium,5


# Save the categorised data to a new .csv file

In [28]:
# Write the categorised frame to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='new_winequality_red.csv', index=False)