In [2]:
import pandas as pd
from sklearn.utils import shuffle

### Loading and concatenating all datasets 

In [21]:
# Forward slashes are accepted on Windows, Linux and macOS alike, whereas the
# previous '..\\data\\...' form only resolved on Windows. This also matches the
# "../data/raw_data.csv" path used when the combined dataset is saved.
DATA_DIR = '../data'

# One CSV per news category; each file is loaded into its own frame.
business_df = pd.read_csv(f'{DATA_DIR}/business_data.csv')
education_df = pd.read_csv(f'{DATA_DIR}/education_data.csv')
entertainment_df = pd.read_csv(f'{DATA_DIR}/entertainment_data.csv')
sports_df = pd.read_csv(f'{DATA_DIR}/sports_data.csv')
technology_df = pd.read_csv(f'{DATA_DIR}/technology_data.csv')

In [22]:
# Stack the per-category frames row-wise into a single frame.
# Each frame keeps its own row labels, so labels repeat across categories
# until the index is reset later on.
dfs = [
    business_df,
    education_df,
    entertainment_df,
    sports_df,
    technology_df,
]
df = pd.concat(dfs, axis=0)

In [23]:
# Sanity check: list the distinct labels in the 'category' column after concatenation.
df['category'].unique()

array(['business', 'education', 'entertainment', 'sports', 'technology'],
      dtype=object)

In [24]:
# Shuffle the rows so the categories are interleaved instead of grouped by file.
# A fixed random_state makes the row order (and every output below) identical
# on each Restart & Run All.
df = shuffle(df, random_state=42)

In [25]:
# Replace the shuffled row labels with a fresh 0..n-1 index.
# drop=False keeps the old labels as an 'index' column, which is removed
# together with 'url' in the column-dropping step.
df = df.reset_index(drop=False)

In [26]:
# Preview the first rows; the 'index' column holds the pre-shuffle row labels.
df.head()

Unnamed: 0,index,headlines,description,content,url,category
0,1751,Australian Open semifinal: Unstoppable Novak D...,The odds are stacked heavily against first-tim...,Grand Slam fever grips tennis fans all over th...,https://indianexpress.com/article/sports/tenni...,sports
1,673,NTA changes JEE Main 2024 website; check new p...,JEE Main 2024: The new website also has an aut...,JEE Main 2024: The National Testing Agency (NT...,https://indianexpress.com/article/education/nt...,education
2,193,Guntur Kaaram movie review: Mahesh Babu tries ...,"Guntur Kaaram movie review: Mahesh Babu, in to...",Trivikram Srinivas’ Guntur Kaaram serves as de...,https://indianexpress.com/article/entertainmen...,entertainment
3,732,Our online tools will change the way you see s...,Jai Asundi spoke to indianexpress.com on the c...,Jai Asundi is the executive director of Bengal...,https://indianexpress.com/article/technology/t...,technology
4,1870,Busting myths regarding CLAT preparation,CLAT is designed to be a test of aptitude and ...,— Supratim Chakrabarty\nThe Common Law Admissi...,https://indianexpress.com/article/education/bu...,education


In [27]:
# (rows, columns) of the combined, shuffled frame.
df.shape

(10000, 6)

### Dropping unnecessary columns

In [28]:
# 'index' is the leftover of reset_index and 'url' is removed along with it;
# neither appears in the frame from here on.
columns_to_drop = ['index', 'url']
df = df.drop(columns=columns_to_drop)

In [29]:
# Confirm that only headlines, description, content and category remain.
df.head(10)

Unnamed: 0,headlines,description,content,category
0,Australian Open semifinal: Unstoppable Novak D...,The odds are stacked heavily against first-tim...,Grand Slam fever grips tennis fans all over th...,sports
1,NTA changes JEE Main 2024 website; check new p...,JEE Main 2024: The new website also has an aut...,JEE Main 2024: The National Testing Agency (NT...,education
2,Guntur Kaaram movie review: Mahesh Babu tries ...,"Guntur Kaaram movie review: Mahesh Babu, in to...",Trivikram Srinivas’ Guntur Kaaram serves as de...,entertainment
3,Our online tools will change the way you see s...,Jai Asundi spoke to indianexpress.com on the c...,Jai Asundi is the executive director of Bengal...,technology
4,Busting myths regarding CLAT preparation,CLAT is designed to be a test of aptitude and ...,— Supratim Chakrabarty\nThe Common Law Admissi...,education
5,"Akshay Kumar bats for long-lasting fitness, ac...",Akshay Kumar was one of the many personalities...,One must focus on leading a fitter life and no...,entertainment
6,"Write-off era over, asset quality review serve...","Over the last five years, the gross non-perfor...","THE ERA OF loan write-offs is over, the asset ...",business
7,Life in an IIT: ‘How IIT Delhi shaped my caree...,"For Kanpur boy Amar Srivastava, studying at II...",—Amar Srivastava\nStudying at the Indian Insti...,education
8,Rupee rises 6 paise to 83.32 against US dollar...,Forex traders said the rupee fell to fresh rec...,The rupee recovered from its all-time low leve...,business
9,Will increase interest rates if cost of funds ...,"As per a recent SBI research report, the banki...",State Bank of India (SBI) chairman Dinesh Khar...,business


### Mapping categories to numbers and creating a new column with the category number

In [30]:
# Built once when this cell runs instead of on every call: map_category is
# applied to each row of the dataset, so rebuilding the dict per call was
# wasted work.
CATEGORY_MAP = {
    'business': 0,
    'education': 1,
    'entertainment': 2,
    'sports': 3,
    'technology': 4
}


def map_category(category):
    """Return the integer label for a news category name.

    Parameters
    ----------
    category : str
        One of 'business', 'education', 'entertainment', 'sports' or
        'technology'.

    Returns
    -------
    int or None
        The numeric label for ``category``, or ``None`` when the name is not
        one of the known categories (no exception is raised).
    """
    return CATEGORY_MAP.get(category)

In [31]:
# Spot-check the mapping on a single known label.
map_category('business')

0

In [32]:
# Translate every category label to its integer code, then attach the result
# as a new column next to the original label.
category_codes = df['category'].apply(map_category)
df['category_num'] = category_codes

In [33]:
# Check that category_num lines up with the category label on each row.
df.head(10)

Unnamed: 0,headlines,description,content,category,category_num
0,Australian Open semifinal: Unstoppable Novak D...,The odds are stacked heavily against first-tim...,Grand Slam fever grips tennis fans all over th...,sports,3
1,NTA changes JEE Main 2024 website; check new p...,JEE Main 2024: The new website also has an aut...,JEE Main 2024: The National Testing Agency (NT...,education,1
2,Guntur Kaaram movie review: Mahesh Babu tries ...,"Guntur Kaaram movie review: Mahesh Babu, in to...",Trivikram Srinivas’ Guntur Kaaram serves as de...,entertainment,2
3,Our online tools will change the way you see s...,Jai Asundi spoke to indianexpress.com on the c...,Jai Asundi is the executive director of Bengal...,technology,4
4,Busting myths regarding CLAT preparation,CLAT is designed to be a test of aptitude and ...,— Supratim Chakrabarty\nThe Common Law Admissi...,education,1
5,"Akshay Kumar bats for long-lasting fitness, ac...",Akshay Kumar was one of the many personalities...,One must focus on leading a fitter life and no...,entertainment,2
6,"Write-off era over, asset quality review serve...","Over the last five years, the gross non-perfor...","THE ERA OF loan write-offs is over, the asset ...",business,0
7,Life in an IIT: ‘How IIT Delhi shaped my caree...,"For Kanpur boy Amar Srivastava, studying at II...",—Amar Srivastava\nStudying at the Indian Insti...,education,1
8,Rupee rises 6 paise to 83.32 against US dollar...,Forex traders said the rupee fell to fresh rec...,The rupee recovered from its all-time low leve...,business,0
9,Will increase interest rates if cost of funds ...,"As per a recent SBI research report, the banki...",State Bank of India (SBI) chairman Dinesh Khar...,business,0


### Creating content for training from headline, description and content

In [34]:
# Join the three text fields into one space-separated string per article.
text_columns = ['headlines', 'description', 'content']
df['news_content'] = df['headlines'] + " " + df['description'] + " " + df['content']

# The individual text fields are now redundant, so only the merged text is kept.
df = df.drop(columns=text_columns)

In [37]:
# Preview the final layout: category, category_num and the merged news_content.
df.head()

Unnamed: 0,category,category_num,news_content
0,sports,3,Australian Open semifinal: Unstoppable Novak D...
1,education,1,NTA changes JEE Main 2024 website; check new p...
2,entertainment,2,Guntur Kaaram movie review: Mahesh Babu tries ...
3,technology,4,Our online tools will change the way you see s...
4,education,1,Busting myths regarding CLAT preparation CLAT ...


### Saving the dataset in its raw state (no cleaning or vectorization)

In [36]:
# Persist the combined dataset. The row index is written as the first
# (unnamed) column of the file, so readers of raw_data.csv need to account for it.
df.to_csv("../data/raw_data.csv", index=True)