In [19]:
import pandas as pd
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

## 1. Import data

In [20]:
data = pd.read_excel (r'./T4.xlsx')

In [24]:
#### Display the the number of empty values in the dataset

In [32]:
def total_of_missing_values(data):
    missing_data = data.isna().sum().sum()
    print("\nNumber of NaN values:", missing_data)
    #return missing_data

In [25]:
#### Display the the number and the percent of missing values for each column in the dataset

In [26]:
def missing_data(data):
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (100*data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    print(missing_data[missing_data['Percent']>0])

In [27]:
#### Show the dataset shape

In [35]:
def show_data_dimensions(data):
    print('number of rows : '+str(data.shape[0])+', number of columns : '+str(data.shape[1]))
    #return data.shape[0]

In [36]:
#### Clean the data by dropping the duplicates rows and also by filling the empty values with previous values

In [37]:
def preprocessing(original_data):

    clean_data = original_data.copy()

    ### drop duplicate rows
    clean_data.drop_duplicates(keep='first', inplace=True)

    ### fill missing data and drop the remaing
    clean_data = clean_data.fillna(method='ffill')
    clean_data.dropna(axis=0, inplace=True)
    return clean_data

In [33]:
total_of_missing_values(data)


Number of NaN values: 1010258


In [34]:
missing_data(data)

                                     Total    Percent
function_call_interrupts_cpu0        68773  44.080453
function_call_interrupts_sum_cpu123  68773  44.080453
companion_sum_cpu123                 63667  40.807732
SLIMBUS_sum_cpu123                   63667  40.807732
volume_up_sum_cpu123                 63666  40.807092
...                                    ...        ...
Battery_online                          44   0.028202
Battery_level                           44   0.028202
Battery_invalid_charger                 44   0.028202
Battery_icon_small                      44   0.028202
Battery_health                          44   0.028202

[113 rows x 2 columns]


In [38]:
show_data_dimensions(data)

number of rows : 156017, number of columns : 116


In [8]:
data2 = preprocessing(data)

In [40]:
#Convert all the data to string so that we can use combination of columns
data2 = data2.applymap(str)

In [41]:
#concate columns to use them for recommendation
data2['version_Battery_level'] = data2['Version'] +"_"+data2['Battery_level']

In [42]:
###This bloc is used only because we have a large datasetand the memory cannot support this.
#if you have a powerful pc you can remove this bloc
data2 = data2.iloc[:20000]
len(data), len(data2)

(156017, 20000)

In [43]:
#Here we do some processing to remove all english stop words if we have in our data set
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')
#Replace NaN with an empty string
data2['version_Battery_level'] = data2['version_Battery_level'].fillna('')
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(data2['version_Battery_level'])
#Output the shape of tfidf_matrix
tfidf_matrix.shape

(20000, 109)

In [44]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim.shape

(20000, 20000)

In [45]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(data2.index, index=data2['version_Battery_level']).drop_duplicates()

In [46]:
# Function that takes in version and Battery_level as input and outputs most top 10
def get_recommendations(version,Battery_level, cosine_sim=cosine_sim):
    version_Battery_level = version+'_'+str(Battery_level)
    # Get the index of the version_Battery_level that matches the version_Battery_level
    idx = indices[version_Battery_level]
    
    # Get the pairwsie similarity scores of all version_Battery_level with that version_Battery_level
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the version_Battery_level based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: max(x[1]), reverse=True)

    # Get the scores of the 10 most similar version_Battery_level
    sim_scores = sim_scores[1:11]

    # Get the version_Battery_level indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar version_Battery_level
    return data2.iloc[movie_indices]

In [47]:
get_recommendations('2.3.1',61.0)

IndexError: index 26204 is out of bounds for axis 0 with size 20000

In [18]:
#data2.to_excel("data2.xlsx") 

In [12]:
#data2 = data2.drop(['Version','Battery_level','tot_idle','tot_iowait','tot_irq','tot_softirq','ctxt','btime','processes','procs_running','procs_blocked'],axis=1)
#data2 = data2.drop(['SwapTotal','SwapFree','Dirty','Writeback','AnonPages','Mapped','Shmem','Slab'],axis=1)
#data2 = data2.drop(['SReclaimable','SUnreclaim','KernelStack','PageTables','CommitLimit','Committed_AS','VmallocTotal','VmallocUsed','VmallocChunk'],axis=1)

In [13]:
len(data2['version_Battery_level'].unique())

2

In [None]:
columns = data2.columns
for column in columns:
    print(column, len(data2[column].unique()))

In [14]:


#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
data2['version_Battery_level'] = data2['version_Battery_level'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(data2['version_Battery_level'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(20000, 17)

In [15]:


# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [16]:
cosine_sim.shape

(20000, 20000)

In [36]:
data2['version_Battery_level']

0        2.3.1_0         61.0\n2         61.0\n12      ...
2        2.3.1_0         61.0\n2         61.0\n12      ...
12       2.3.1_0         61.0\n2         61.0\n12      ...
13       2.3.1_0         61.0\n2         61.0\n12      ...
14       2.3.1_0         61.0\n2         61.0\n12      ...
                               ...                        
89248    2.3.2_0         61.0\n2         61.0\n12      ...
89251    2.3.2_0         61.0\n2         61.0\n12      ...
89253    2.3.2_0         61.0\n2         61.0\n12      ...
89254    2.3.2_0         61.0\n2         61.0\n12      ...
89262    2.3.2_0         61.0\n2         61.0\n12      ...
Name: version_Battery_level, Length: 20000, dtype: object

In [37]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(data2['version_Battery_level'].index, index=data2['version_Battery_level']).drop_duplicates()

In [40]:
(indices[:1])

version_Battery_level
2.3.1_0         61.0\n2         61.0\n12        61.0\n13        61.0\n14        61.0\n          ... \n156004    98.0\n156006    98.0\n156007    98.0\n156014    98.0\n156015    98.0\nName: Battery_level, Length: 61158, dtype: float64    0
dtype: int64

In [19]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(version,Battery_level, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    version_Battery_level = version+'_'+str(Battery_level)
    idx = indices[version_Battery_level]
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return data2.iloc[movie_indices]

In [20]:
get_recommendations('2.3.2',93)

KeyError: '2.3.2_93'