# Data Wrangling and File Handling in Various Formats in Python
### Reading/Writing text, TSV (tab-separated values), and JSON files in Python

## 1. Text File

In [1]:
# Writing text file

In [2]:
# Create (or overwrite) first.txt with two lines of text.
# write() is used because the payload is a single string; writelines() expects an
# iterable of strings and would walk a plain string one character at a time.
with open('first.txt', 'w') as file:
    file.write('hello\nBrothers')
print('File created successfully')

File created successfully


In [3]:
# Use a for loop to emit five numbered lines; print() appends the newline for us.
with open('first1.txt', 'w') as out:
    for line_no in range(1, 6):
        print(f'This is line {line_no}.', file=out)
print('File created successfully')

File created successfully


In [4]:
# Reading text file

In [5]:
# Load the file as a list of lines (each entry keeps its trailing newline).
with open('first1.txt', 'r') as src:
    data = src.readlines()

# Echo every line with leading/trailing whitespace removed.
for line in data:
    print(line.strip())

This is line 1.
This is line 2.
This is line 3.
This is line 4.
This is line 5.


## 2. TSV (Tab separated values)

In [6]:
# Build a small employee table that the next cell writes out as a TSV file.
import pandas as pd

data = dict(
    emp_id=[101, 102, 103],
    emp_name=['Rohit', 'Mohti', 'Sohit'],
    emp_salary=[100000, 200000, 300000],
    city=['Mumbai', 'Delhi', 'Goa'],
)
df = pd.DataFrame(data)
df

Unnamed: 0,emp_id,emp_name,emp_salary,city
0,101,Rohit,100000,Mumbai
1,102,Mohti,200000,Delhi
2,103,Sohit,300000,Goa


In [7]:
# Save the frame as tab-separated text; index=False keeps the row index out of the file.
df.to_csv(
    'tsv_file.tsv',
    sep='\t',
    index=False,
)
print('File is created')

File is created


In [8]:
# Reading TSV file: read_table parses tab-delimited text by default.
df1 = pd.read_table('tsv_file.tsv')
df1

Unnamed: 0,emp_id,emp_name,emp_salary,city
0,101,Rohit,100000,Mumbai
1,102,Mohti,200000,Delhi
2,103,Sohit,300000,Goa


## 3. JSON file

In [9]:
# Nested structure (a dict holding a list of employee records) used for the JSON round trip.
data = {
    'employees': [
        dict(name='Rohit', age=23, department='IT'),
        dict(name='Mohit', age=28, department='HR'),
        dict(name='Sohit', age=25, department='Sales'),
    ]
}

In [10]:
# Write the employee dictionary to disk as pretty-printed JSON (4-space indent).
# `json` is imported in this cell because it is the first cell that uses the module;
# without the import a fresh kernel (Restart & Run All) fails here with NameError.
import json

with open('JSON_example.json', 'w') as file:
    json.dump(data, file, indent = 4)

In [11]:
# Reading JSON file: parse the saved document back into Python objects.
import json

with open('JSON_example.json', 'r') as source:
    data1 = json.loads(source.read())

print(data1)

{'employees': [{'name': 'Rohit', 'age': 23, 'department': 'IT'}, {'name': 'Mohit', 'age': 28, 'department': 'HR'}, {'name': 'Sohit', 'age': 25, 'department': 'Sales'}]}


In [12]:
# Print a header row, then one tab-separated row per employee record.
print('Name', 'Age', 'Department', sep='\t')
for employee in data1['employees']:
    print(f"{employee['name']}\t{employee['age']}\t{employee['department']}")

Name	Age	Department
Rohit	23	IT
Mohit	28	HR
Sohit	25	Sales


# Data Preprocessing: Scaling, Encoding, Normalization (with scikit learn)

## 1. Feature Scaling
### - Scaling adjusts values so that every feature contributes equally

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [14]:
# Two numeric features on very different scales, used by the scaling examples.
num_data = pd.DataFrame(
    dict(
        salary=[90000, 120000, 230000, 400000, 500000],
        age=[23, 45, 37, 49, 56],
    )
)
num_data

Unnamed: 0,salary,age
0,90000,23
1,120000,45
2,230000,37
3,400000,49
4,500000,56


In [15]:
# `scaled` is an instance of the StandardScaler class; the class's methods are called through this instance.
scaled = StandardScaler()
# Fit on the data, then transform the same data (what fit_transform does in one call).
rescaled = scaled.fit(num_data).transform(num_data)
# Wrap the resulting array back into a DataFrame with the original column names.
df = pd.DataFrame(data=rescaled, columns=num_data.columns)

In [16]:
# Display the frame holding the StandardScaler output (salary and age columns).
df

Unnamed: 0,salary,age
0,-1.120051,-1.679379
1,-0.931278,0.265165
2,-0.239112,-0.441942
3,0.830599,0.618718
4,1.459841,1.237437


In [25]:
# MinMaxScaler: rescale each column into the interval given by feature_range.
minmax_scaler = MinMaxScaler(feature_range=(0, 1))
minmax_rescaled = minmax_scaler.fit_transform(num_data)
df1 = pd.DataFrame(data=minmax_rescaled, columns=num_data.columns)
# Show the values rounded to two decimals; df1 itself is left unrounded.
df1.round(2)

Unnamed: 0,salary,age
0,0.0,0.0
1,0.07,0.67
2,0.34,0.42
3,0.76,0.79
4,1.0,1.0


##### The most common method is Min-Max Scaling, which maps data into the range $[0, 1]$ using the formula:
#####        $X_{norm} = \dfrac{X - X_{min}}{X_{max} - X_{min}}$
##### Other techniques include mean normalization and max absolute scaling.

## Encoding Categorical Variables
#### - Converts text labels into a numeric format for ML models

In [28]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# `from sklearn.preprocessing import LabelEncoder, OneHotEncoder` imports two encoder
# classes from the scikit-learn library.
# These classes convert categorical data into a numerical format that machine learning models can understand.

In [47]:
# Single-column frame of department labels used by the encoding examples.
departments = ['IT', 'HR', 'Sales', 'HR', 'IT', 'IT', 'Purchase']
cat_data = pd.DataFrame({'Department': departments})
cat_data

Unnamed: 0,Department
0,IT
1,HR
2,Sales
3,HR
4,IT
5,IT
6,Purchase


### ⇨Label Encoding

In [44]:
# Map each department name to an integer code and store it in a new 'Cat_dept' column.
label_encode = LabelEncoder()
dept_codes = label_encode.fit_transform(cat_data['Department'])
cat_data['Cat_dept'] = dept_codes
cat_data

Unnamed: 0,Department,Cat_dept
0,IT,1
1,HR,0
2,Sales,3
3,HR,0
4,IT,1
5,IT,1
6,Purchase,2


In [57]:
# fit(): learns all the unique categories in the column.
# transform(): converts each text label into a number (HR -> 0, IT -> 1). Codes are assigned in sorted (alphabetical) order.

### ⇨ One-Hot-Encoding

In [54]:
# cat_data1 = pd.get_dummies(cat_data, columns = ['Department'], prefix = 'Dept').astype(int)

In [55]:
# One indicator column per distinct department, each named with the 'Dept' prefix.
cat_data1 = pd.get_dummies(
    cat_data['Department'],
    prefix='Dept',
)

In [56]:
# Display the one-hot encoded department columns.
cat_data1

Unnamed: 0,Dept_HR,Dept_IT,Dept_Purchase,Dept_Sales
0,False,True,False,False
1,True,False,False,False
2,False,False,False,True
3,True,False,False,False
4,False,True,False,False
5,False,True,False,False
6,False,False,True,False


In [58]:
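# Label Encoding converts categories to integers (0, 1, 2, ...). It suits tree models and ordinal data
# (e.g., Small, Medium, Large), but note that LabelEncoder numbers categories in alphabetical order, so an
# ordinal feature needs its order specified explicitly (e.g., with OrdinalEncoder(categories=...)).
# One-Hot Encoding creates a new binary column (0 or 1) for each category. It is the better choice for
# nominal data (e.g., colors) and for linear models because it avoids implying a false order, though it
# increases dimensionality.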

## ⇨ Normalization

In [59]:
from sklearn.preprocessing import Normalizer

# `from sklearn.preprocessing import Normalizer` imports the Normalizer class from scikit-learn's preprocessing module.
# This class scales individual samples (rows of data) to have unit norm (a length of one).

In [64]:
# Three 2-D sample rows used to demonstrate row-wise normalization.
X = np.array(
    [
        [3.0, 4.0],
        [1.0, 0.0],
        [0.0, 0.8],
    ]
)
print('Original Array:', X, sep='\n ')

Original Array:
 [[3.  4. ]
 [1.  0. ]
 [0.  0.8]]


In [66]:
# Rescale each row of X to unit length (see the printed result: [3, 4] becomes [0.6, 0.8]).
nor = Normalizer()
nor_data = nor.fit(X).transform(X)
print(nor_data)

[[0.6 0.8]
 [1.  0. ]
 [0.  1. ]]


### StandardScaler (The Default Choice): Works column-wise. It looks at one feature (e.g., "Age") across all people and ensures the mean is 0 and variance is 1. You use this so that features with large numbers (like Salary) don't "bully" features with small numbers (like Age) during model training.

### Normalizer (used in the cells above): Works row-wise. It looks at one person’s data and scales their attributes so the row vector has a length of 1. It doesn't care about the average age of the group; it only cares about the proportions within that specific row.

# CASE STUDY

In [96]:
# Raw customer records for the case study, one tuple per customer.
customer_rows = [
    (101, 'Male', 23, 35000),
    (102, 'Female', 45, 24000),
    (103, 'Female', 34, 70000),
    (104, 'Male', 67, 46000),
    (105, 'Male', 56, 50000),
]
raw_data = pd.DataFrame(
    customer_rows,
    columns=['CustomerID', 'Gender', 'Age', 'Salary'],
)
raw_data

Unnamed: 0,CustomerID,Gender,Age,Salary
0,101,Male,23,35000
1,102,Female,45,24000
2,103,Female,34,70000
3,104,Male,67,46000
4,105,Male,56,50000


In [97]:
# Drop the CustomerID column, keeping Gender, Age and Salary; raw_data itself is not modified.
data = raw_data.drop(columns=['CustomerID'])
data

Unnamed: 0,Gender,Age,Salary
0,Male,23,35000
1,Female,45,24000
2,Female,34,70000
3,Male,67,46000
4,Male,56,50000


In [98]:
# Encode gender column: replace the text labels with integer codes (Female -> 0, Male -> 1 in the output below).
label_encoder = LabelEncoder()
gender_codes = label_encoder.fit_transform(data['Gender'])
data['Gender'] = gender_codes

In [99]:
# Display the frame after the Gender column has been label-encoded.
data

Unnamed: 0,Gender,Age,Salary
0,1,23,35000
1,0,45,24000
2,0,34,70000
3,1,67,46000
4,1,56,50000


In [103]:
# Scale age and salary: min-max rescale both numeric columns and write them back into `data`.
numeric_cols = ['Age', 'Salary']
scaled = MinMaxScaler()
rescale = scaled.fit_transform(data[numeric_cols])
data[numeric_cols] = rescale

In [104]:
# Display the frame after Age and Salary have been min-max scaled.
data

Unnamed: 0,Gender,Age,Salary
0,1,0.0,0.23913
1,0,0.5,0.0
2,0,0.25,1.0
3,1,1.0,0.478261
4,1,0.75,0.565217
