In [1]:
import numpy as np
import pandas as pd

# Load the tab-separated news file (it has no header row) and name its
# eight columns directly while reading.
data = pd.read_csv(
    "../data/news.tsv",
    sep="\t",
    header=None,
    names=[
        "newsid", "category", "subcategory", "address",
        "summry", "url", "entities", "concepts",
    ],
)
print(data.head())


   newsid   category      subcategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   
3  N53526     health           voices   
4  N38324     health          medical   

                                             address  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   
3  I Was An NBA Wife. Here's How It Affected My M...   
4  How to Get Rid of Skin Tags, According to a De...   

                                              summry  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   
3  I felt like I was a fraud, and being an NBA wi...   
4  They seem harmless, but there's a very good re...   

                                             url  \
0  https://assets.msn.com/l

In [2]:
# Show each column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51282 entries, 0 to 51281
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   newsid       51282 non-null  object
 1   category     51282 non-null  object
 2   subcategory  51282 non-null  object
 3   address      51282 non-null  object
 4   summry       48616 non-null  object
 5   url          51282 non-null  object
 6   entities     51279 non-null  object
 7   concepts     51278 non-null  object
dtypes: object(8)
memory usage: 3.1+ MB


In [3]:
# Count the missing values in every column.
data.isna().sum()

newsid            0
category          0
subcategory       0
address           0
summry         2666
url               0
entities          3
concepts          4
dtype: int64

In [4]:
# Discard every row that has at least one missing value (mutating `data`),
# then recount the missing values to confirm none remain.
data.dropna(axis=0, how="any", inplace=True)
data.isna().sum()

newsid         0
category       0
subcategory    0
address        0
summry         0
url            0
entities       0
concepts       0
dtype: int64

In [5]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

np.int64(0)

# Extract Text Features 
## Step: Data preparation
- Clean text is important for TF-IDF.
- It increases the accuracy of the recommendations.
- It improves the training speed.
- It makes the system more flexible in handling new data.

In [6]:
# Cast both text columns to string and join them with a single space.
data["content"] = data["address"].astype(str).str.cat(
    data["summry"].astype(str), sep=" "
)
# Further text cleaning
import re

def pure_txt(txt: str) -> str:
    """Normalize a text string before vectorization.

    Lowercases the text, removes every character that is neither a word
    character (letter, digit, underscore) nor whitespace, and collapses runs
    of whitespace into single spaces with both ends trimmed.

    Args:
        txt: Raw text to clean.

    Returns:
        The cleaned text, e.g. ``"Don't  STOP!"`` becomes ``"dont stop"``.
    """
    txt = txt.lower()
    # Keep only letters, digits, underscores and whitespace. Apostrophes are
    # removed here too ("don't" -> "dont"), so no separate apostrophe pass is
    # needed afterwards: a pattern such as "'\w+" could never match.
    txt = re.sub(r"[^\w\s]", "", txt)
    # Collapse whitespace runs and trim the ends: " hello   world" -> "hello world".
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt

# Run the cleaner over every combined text and preview the first rows.
data["pure_content"] = data["content"].map(pure_txt)
print(data["pure_content"].head())


0    the brands queen elizabeth prince charles and ...
1    50 worst habits for belly fat these seemingly ...
2    the cost of trumps aid freeze in the trenches ...
3    i was an nba wife heres how it affected my men...
4    how to get rid of skin tags according to a der...
Name: pure_content, dtype: object


In [7]:
# Save the cleaned news table (including the new text columns) without the index.
data.to_csv("../data/pure_news.csv",index=False)

In [8]:
# Load the tab-separated behaviors log (no header row), naming its columns on read.
behavior_df = pd.read_csv(
    "../data/behaviors.tsv",
    sep="\t",
    header=None,
    names=["behavior_id", "user_id", "timestamp", "history", "impressions"],
)
print(behavior_df.head())

   behavior_id user_id              timestamp  \
0            1  U13740  11/11/2019 9:05:58 AM   
1            2  U91836  11/12/2019 6:11:30 PM   
2            3  U73700  11/14/2019 7:01:48 AM   
3            4  U34670  11/11/2019 5:28:05 AM   
4            5   U8125  11/12/2019 4:11:21 PM   

                                             history  \
0  N55189 N42782 N34694 N45794 N18445 N63302 N104...   
1  N31739 N6072 N63045 N23979 N35656 N43353 N8129...   
2  N10732 N25792 N7563 N21087 N41087 N5445 N60384...   
3  N45729 N2203 N871 N53880 N41375 N43142 N33013 ...   
4                        N10078 N56514 N14904 N33740   

                                         impressions  
0                                  N55689-1 N35729-0  
1  N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...  
2  N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...  
3                N35729-0 N33632-0 N49685-1 N27581-0  
4  N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...  


In [9]:
# Count the missing values in every behaviors column.
behavior_df.isna().sum()

behavior_id       0
user_id           0
timestamp         0
history        3238
impressions       0
dtype: int64

In [10]:
# Count behavior rows that are exact duplicates of an earlier row.
behavior_df.duplicated().sum()

np.int64(0)

In [11]:
# Show each column's dtype and non-null count for the behaviors table.
behavior_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156965 entries, 0 to 156964
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   behavior_id  156965 non-null  int64 
 1   user_id      156965 non-null  object
 2   timestamp    156965 non-null  object
 3   history      153727 non-null  object
 4   impressions  156965 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.0+ MB


In [12]:
# NOTE(review): .sum() adds together the rows of describe() (count, mean, std,
# min, quartiles, max) per numeric column, which is not a meaningful statistic.
# Presumably plain `behavior_df.describe()` was intended -- confirm.
behavior_df.describe().sum()

behavior_id    673175.036839
dtype: float64

In [13]:
# Drop, in place, every row with a missing value; in the null counts shown
# earlier only `history` had any missing entries.
behavior_df.dropna(inplace=True)

In [14]:
# Save the cleaned behaviors table without the index column.
behavior_df.to_csv("../data/clean_behaviors.csv", index=False)