# Task Automation with Python Scripts

# Automate Data Cleaning in Python

In [1]:
import pandas as pd
import numpy as np
# Sample data with duplicates
data = {'Name': ['Anu', 'sonu', 'chachu', 'balu','Alice', 'Bob', 'Alice', 'Charlie'], 'Age': [32,8,9,27,25, 30, 25, 35],
        'dob':[16/11/1991,7/2/2017, 24/6/2016, 2/3/1998,2/5/1994,1/2/1992,2/5/1994,1/4/1989], 
        'Price': ['5', '10', '15','20','22','23', '22', '35'], 
        'Quantity': [2, 5, '3', '5', 4,6,4,'7'],'Scores': [np.nan, 88, 75, 92, 68,np.nan, 68,70]}
df = pd.DataFrame(data)
df
print(df)


      Name  Age       dob Price Quantity  Scores
0      Anu   32  0.000731     5        2     NaN
1     sonu    8  0.001735    10        5    88.0
2   chachu    9  0.001984    15        3    75.0
3     balu   27  0.000334    20        5    92.0
4    Alice   25  0.000201    22        4    68.0
5      Bob   30  0.000251    23        6     NaN
6    Alice   25  0.000201    22        4    68.0
7  Charlie   35  0.000126    35        7    70.0


# Eliminating Duplicate Data

In [2]:
# Removing duplicates
# Display the cleaned data
df = df.drop_duplicates()
df


Unnamed: 0,Name,Age,dob,Price,Quantity,Scores
0,Anu,32,0.000731,5,2,
1,sonu,8,0.001735,10,5,88.0
2,chachu,9,0.001984,15,3,75.0
3,balu,27,0.000334,20,5,92.0
4,Alice,25,0.000201,22,4,68.0
5,Bob,30,0.000251,23,6,
7,Charlie,35,0.000126,35,7,70.0


# Handling Missing Values

In [3]:
#count missing values
df.isnull().sum()

Name        0
Age         0
dob         0
Price       0
Quantity    0
Scores      2
dtype: int64

In [4]:
# Fill missing values with the median
df['Scores'].fillna(value=df['Scores'].mean(), inplace=True)
print("Fill with mean:\n", df)
print()

# Fill missing values with the median
df['Scores'].fillna(value=df['Scores'].median(), inplace=True)
print("Fill with median:\n", df)
print()

# Custom method: Fill with a predetermined value
df['Scores'].fillna(value=85, inplace=True)
print("Custom fill value:\n", df)

Fill with mean:
       Name  Age       dob Price Quantity  Scores
0      Anu   32  0.000731     5        2    78.6
1     sonu    8  0.001735    10        5    88.0
2   chachu    9  0.001984    15        3    75.0
3     balu   27  0.000334    20        5    92.0
4    Alice   25  0.000201    22        4    68.0
5      Bob   30  0.000251    23        6    78.6
7  Charlie   35  0.000126    35        7    70.0

Fill with median:
       Name  Age       dob Price Quantity  Scores
0      Anu   32  0.000731     5        2    78.6
1     sonu    8  0.001735    10        5    88.0
2   chachu    9  0.001984    15        3    75.0
3     balu   27  0.000334    20        5    92.0
4    Alice   25  0.000201    22        4    68.0
5      Bob   30  0.000251    23        6    78.6
7  Charlie   35  0.000126    35        7    70.0

Custom fill value:
       Name  Age       dob Price Quantity  Scores
0      Anu   32  0.000731     5        2    78.6
1     sonu    8  0.001735    10        5    88.0
2   chachu 

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 0 to 7
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      7 non-null      object 
 1   Age       7 non-null      int64  
 2   dob       7 non-null      float64
 3   Price     7 non-null      object 
 4   Quantity  7 non-null      object 
 5   Scores    7 non-null      float64
dtypes: float64(2), int64(1), object(3)
memory usage: 392.0+ bytes


In [6]:
print(df.dtypes)

Name         object
Age           int64
dob         float64
Price        object
Quantity     object
Scores      float64
dtype: object


#  Automatically Detecting and Converting Data Types in Python

# infer_objects()

In [7]:
# importing pandas as pd 
import pandas as pd 
  
# Creating the dataframe  
df1 = pd.DataFrame({"A":["sofia", 5, 8, 11, 100], 
                   "B":[2, 8, 77, 4, 11], 
                   "C":["amy", 11, 4, 6, 9]}) 
  
# Print the dataframe 
df1 

Unnamed: 0,A,B,C
0,sofia,2,amy
1,5,8,11
2,8,77,4
3,11,4,6
4,100,11,9


In [8]:

# to print the basic info 
df1.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       5 non-null      object
 1   B       5 non-null      int64 
 2   C       5 non-null      object
dtypes: int64(1), object(2)
memory usage: 248.0+ bytes


In [9]:
# slice from the 1st row till end 
df_new = df1[1:] 
  
# Let's print the new data frame 
df_new 
  
# Now let's print the data type of the columns 
#df_new.info() 

Unnamed: 0,A,B,C
1,5,8,11
2,8,77,4
3,11,4,6
4,100,11,9


In [10]:
# Now let's print the data type of the columns 
df_new.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 1 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4 non-null      object
 1   B       4 non-null      int64 
 2   C       4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes


In [11]:

# applying infer_objects() function. 
df_new = df_new.infer_objects() 
  
# Print the dtype after applying the function 
df_new.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 1 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       4 non-null      int64
 1   B       4 non-null      int64
 2   C       4 non-null      int64
dtypes: int64(3)
memory usage: 228.0 bytes


# File organization with python

In [None]:
import os
import shutil
#Shutil module offers high-level operation on a file like a copy, create, and remote operation on the file.
path=input("enter path:")
files=os.listdir(path)
for file in files:
    filename,extension=os.path.splitext(file)
    extension=extension[1:]
    if os.path.exists(path+'/'+extension):
        shutil.move(path+'/'+file, path+'/'+extension+'/'+file)
    else:
        os.makedirs(path+'/'+extension)
        shutil.move(path+'/'+path+'/'+extension+'/'+file)
#os.path.exists() method in Python is used to check whether the specified path exists or not. This method can be also used to check whether the given path refers to an open file descriptor or not.
# os.makedirs()the os module raise OSError in the case of invalid or inaccessible file names and paths
#The os.listdir() method in Python is used to get the list of all files and directories in the specified directory. 