In [2]:
import pandas as pd
from sklearn.utils import shuffle

### Loading and concatenating all datasets 

In [3]:
# Load one CSV per news category from the sibling data directory.
# Forward slashes are used because backslash-separated relative paths only
# resolve on Windows, whereas '/' is accepted by every platform.
business_df = pd.read_csv('../data/business_data.csv')
education_df = pd.read_csv('../data/education_data.csv')
entertainment_df = pd.read_csv('../data/entertainment_data.csv')
sports_df = pd.read_csv('../data/sports_data.csv')
technology_df = pd.read_csv('../data/technology_data.csv')

In [4]:
# Stack the five per-category frames row-wise into a single DataFrame.
# Each frame keeps its own original row labels at this point.
dfs = [
    business_df,
    education_df,
    entertainment_df,
    sports_df,
    technology_df,
]
df = pd.concat(dfs, axis=0)

In [5]:
# List the distinct category labels present in the combined frame.
pd.unique(df['category'])

array(['business', 'education', 'entertainment', 'sports', 'technology'],
      dtype=object)

In [6]:
# Shuffle the rows so the categories are interleaved rather than stacked in
# load order. A fixed seed keeps the row order (and every output derived from
# it) identical on each Restart & Run All.
df = shuffle(df, random_state=42)

In [7]:
# Replace the shuffled row labels with a fresh 0..n-1 index; the previous
# labels are kept as a regular column named 'index'.
df = df.reset_index()

In [8]:
# Preview the first five rows after shuffling and re-indexing.
df.head(n=5)

Unnamed: 0,index,headlines,description,content,url,category
0,688,"Following Abhinav Bindra’s footsteps, Rudrankk...","Despite high fever, which made his body shiver...",Rudrankksh Patil took two trips to Germany see...,https://indianexpress.com/article/sports/follo...,sports
1,381,‘What a finish’: Congratulatory messages pour ...,"Then in the last, dramatic moment, Anderson wa...",New Zealand completed a dramatic fightback on ...,https://indianexpress.com/article/sports/crick...,sports
2,1992,"Microsoft Paint gets Cocreator, a DALL-E power...",The new 'Paint Cocreator' uses DALL-E to gener...,Microsoft’s Paint app on Windows 11 has been g...,https://indianexpress.com/article/technology/t...,technology
3,865,Riot Games announces its first-ever Valorant e...,Top Valorant teams from around the world are c...,American video game developer Riot Games has a...,https://indianexpress.com/article/technology/g...,technology
4,981,"Prithviraj Sukumaran on what worked for Jawan,...",Prithviraj Sukumaran shared insights into his ...,Prithviraj Sukumaran is currently busy with th...,https://indianexpress.com/article/entertainmen...,entertainment


In [9]:
# Show the (rows, columns) size of the combined frame.
df.shape

(10000, 6)

### Dropping unnecessary columns

In [10]:
# Remove the leftover pre-shuffle row labels ('index') and the article URL.
df = df.drop(columns=['index', 'url'])

In [11]:
# Preview the first ten rows after dropping the two columns.
df.head(n=10)

Unnamed: 0,headlines,description,content,category
0,"Following Abhinav Bindra’s footsteps, Rudrankk...","Despite high fever, which made his body shiver...",Rudrankksh Patil took two trips to Germany see...,sports
1,‘What a finish’: Congratulatory messages pour ...,"Then in the last, dramatic moment, Anderson wa...",New Zealand completed a dramatic fightback on ...,sports
2,"Microsoft Paint gets Cocreator, a DALL-E power...",The new 'Paint Cocreator' uses DALL-E to gener...,Microsoft’s Paint app on Windows 11 has been g...,technology
3,Riot Games announces its first-ever Valorant e...,Top Valorant teams from around the world are c...,American video game developer Riot Games has a...,technology
4,"Prithviraj Sukumaran on what worked for Jawan,...",Prithviraj Sukumaran shared insights into his ...,Prithviraj Sukumaran is currently busy with th...,entertainment
5,When Naga Chaitanya told dad Nagarjuna about h...,Naga Chaitanya's acting ambitions started with...,Telugu actor Naga Chaitanya comes from a talen...,entertainment
6,IND vs AUS: Mike Hussey wants Australian batsm...,Australian batters have been heavily criticize...,Former Australian batsman Mike Hussey urged th...,sports
7,YouTube testing ChatGPT-like AI powered chatbo...,YouTube is testing new generative AI powered f...,YouTube is testing new generative AI features ...,technology
8,Reliance’s share price scales fresh 52-week hi...,"""Tata Group's multi-billion-pound investment i...",Tata Sons will build a 40GW battery cell gigaf...,business
9,RBI to bring digital loan aggregators under re...,Digital lenders have been accused of charging ...,The Reserve Bank of India (RBI) has decided to...,business


### Mapping categories to numbers and creating a new column with the category number

In [12]:
def map_category(category):
    """Translate a news category name into its integer label.

    Labels follow the alphabetical order of the names:
    business=0, education=1, entertainment=2, sports=3, technology=4.

    Args:
        category: Category name to look up.

    Returns:
        The integer label, or None when the name is not one of the five
        known categories.
    """
    ordered_names = ('business', 'education', 'entertainment', 'sports', 'technology')
    # Position in the tuple doubles as the numeric label.
    label_by_name = {name: label for label, name in enumerate(ordered_names)}
    return label_by_name.get(category)

In [13]:
# Quick sanity check of the mapping on one known label.
map_category(category='business')

0

In [14]:
# Derive the numeric label column by running every category name through
# map_category, one element at a time.
df['news_category'] = df['category'].map(map_category)

In [15]:
# The textual label is no longer needed once 'news_category' holds its number.
df = df.drop(columns=['category'])

In [16]:
# Preview the first ten rows with the numeric 'news_category' column.
df.head(n=10)

Unnamed: 0,headlines,description,content,news_category
0,"Following Abhinav Bindra’s footsteps, Rudrankk...","Despite high fever, which made his body shiver...",Rudrankksh Patil took two trips to Germany see...,3
1,‘What a finish’: Congratulatory messages pour ...,"Then in the last, dramatic moment, Anderson wa...",New Zealand completed a dramatic fightback on ...,3
2,"Microsoft Paint gets Cocreator, a DALL-E power...",The new 'Paint Cocreator' uses DALL-E to gener...,Microsoft’s Paint app on Windows 11 has been g...,4
3,Riot Games announces its first-ever Valorant e...,Top Valorant teams from around the world are c...,American video game developer Riot Games has a...,4
4,"Prithviraj Sukumaran on what worked for Jawan,...",Prithviraj Sukumaran shared insights into his ...,Prithviraj Sukumaran is currently busy with th...,2
5,When Naga Chaitanya told dad Nagarjuna about h...,Naga Chaitanya's acting ambitions started with...,Telugu actor Naga Chaitanya comes from a talen...,2
6,IND vs AUS: Mike Hussey wants Australian batsm...,Australian batters have been heavily criticize...,Former Australian batsman Mike Hussey urged th...,3
7,YouTube testing ChatGPT-like AI powered chatbo...,YouTube is testing new generative AI powered f...,YouTube is testing new generative AI features ...,4
8,Reliance’s share price scales fresh 52-week hi...,"""Tata Group's multi-billion-pound investment i...",Tata Sons will build a 40GW battery cell gigaf...,0
9,RBI to bring digital loan aggregators under re...,Digital lenders have been accused of charging ...,The Reserve Bank of India (RBI) has decided to...,0


### Creating content for training from headline, description and content

In [17]:
# Build one training text per article: headline, description and body joined
# by single spaces. Missing pieces are treated as empty strings first, because
# string concatenation with NaN yields NaN and would otherwise discard the
# whole text of any row that lacks just one of the three fields.
df['news_content'] = (
    df['headlines'].fillna('')
    + " "
    + df['description'].fillna('')
    + " "
    + df['content'].fillna('')
)
# The three source columns are now redundant with 'news_content'.
df = df.drop(columns=['headlines', 'description', 'content'])

In [18]:
# Preview the first ten rows of the final two-column frame.
df.head(n=10)

Unnamed: 0,news_category,news_content
0,3,"Following Abhinav Bindra’s footsteps, Rudrankk..."
1,3,‘What a finish’: Congratulatory messages pour ...
2,4,"Microsoft Paint gets Cocreator, a DALL-E power..."
3,4,Riot Games announces its first-ever Valorant e...
4,2,"Prithviraj Sukumaran on what worked for Jawan,..."
5,2,When Naga Chaitanya told dad Nagarjuna about h...
6,3,IND vs AUS: Mike Hussey wants Australian batsm...
7,4,YouTube testing ChatGPT-like AI powered chatbo...
8,0,Reliance’s share price scales fresh 52-week hi...
9,0,RBI to bring digital loan aggregators under re...


### Saving dataset in raw state (no cleaning and vectorization)

In [19]:
# Persist the uncleaned, unvectorized dataset. The row index is written as the
# first (unnamed) CSV column, which is the to_csv default made explicit here.
df.to_csv("../data/raw_data.csv", index=True)