|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "### 1.\tImport the required libraries and load the dataset into a pandas DataFrame." |
| 8 | + ] |
| 9 | + }, |
| 10 | + { |
| 11 | + "cell_type": "code", |
| 12 | + "execution_count": 34, |
| 13 | + "metadata": {}, |
| 14 | + "outputs": [], |
| 15 | + "source": [ |
| 16 | + "import pandas as pd\n", |
| 17 | + "import numpy as np\n", |
| 18 | + "df = pd.read_csv(\"../Data/student.csv\")" |
| 19 | + ] |
| 20 | + }, |
| 21 | + { |
| 22 | + "cell_type": "markdown", |
| 23 | + "metadata": {}, |
| 24 | + "source": [ |
| 25 | + "### 2.\tFind the categorical columns and separate them out into a different DataFrame. To do so, use the select_dtypes() function of the pandas DataFrame." |
| 26 | + ] |
| 27 | + }, |
| 28 | + { |
| 29 | + "cell_type": "code", |
| 30 | + "execution_count": 35, |
| 31 | + "metadata": { |
| 32 | + "scrolled": true |
| 33 | + }, |
| 34 | + "outputs": [ |
| 35 | + { |
| 36 | + "data": { |
| 37 | + "text/html": [ |
| 38 | + "<div>\n", |
| 39 | + "<style scoped>\n", |
| 40 | + " .dataframe tbody tr th:only-of-type {\n", |
| 41 | + " vertical-align: middle;\n", |
| 42 | + " }\n", |
| 43 | + "\n", |
| 44 | + " .dataframe tbody tr th {\n", |
| 45 | + " vertical-align: top;\n", |
| 46 | + " }\n", |
| 47 | + "\n", |
| 48 | + " .dataframe thead th {\n", |
| 49 | + " text-align: right;\n", |
| 50 | + " }\n", |
| 51 | + "</style>\n", |
| 52 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 53 | + " <thead>\n", |
| 54 | + " <tr style=\"text-align: right;\">\n", |
| 55 | + " <th></th>\n", |
| 56 | + " <th>Gender</th>\n", |
| 57 | + " <th>Grade</th>\n", |
| 58 | + " <th>Employed</th>\n", |
| 59 | + " </tr>\n", |
| 60 | + " </thead>\n", |
| 61 | + " <tbody>\n", |
| 62 | + " <tr>\n", |
| 63 | + " <th>0</th>\n", |
| 64 | + " <td>Male</td>\n", |
| 65 | + " <td>1st Class</td>\n", |
| 66 | + " <td>yes</td>\n", |
| 67 | + " </tr>\n", |
| 68 | + " <tr>\n", |
| 69 | + " <th>1</th>\n", |
| 70 | + " <td>Female</td>\n", |
| 71 | + " <td>2nd Class</td>\n", |
| 72 | + " <td>no</td>\n", |
| 73 | + " </tr>\n", |
| 74 | + " <tr>\n", |
| 75 | + " <th>2</th>\n", |
| 76 | + " <td>Male</td>\n", |
| 77 | + " <td>1st Class</td>\n", |
| 78 | + " <td>no</td>\n", |
| 79 | + " </tr>\n", |
| 80 | + " <tr>\n", |
| 81 | + " <th>3</th>\n", |
| 82 | + " <td>Female</td>\n", |
| 83 | + " <td>2nd Class</td>\n", |
| 84 | + " <td>no</td>\n", |
| 85 | + " </tr>\n", |
| 86 | + " <tr>\n", |
| 87 | + " <th>4</th>\n", |
| 88 | + " <td>Male</td>\n", |
| 89 | + " <td>1st Class</td>\n", |
| 90 | + " <td>no</td>\n", |
| 91 | + " </tr>\n", |
| 92 | + " </tbody>\n", |
| 93 | + "</table>\n", |
| 94 | + "</div>" |
| 95 | + ], |
| 96 | + "text/plain": [ |
| 97 | + " Gender Grade Employed\n", |
| 98 | + "0 Male 1st Class yes\n", |
| 99 | + "1 Female 2nd Class no\n", |
| 100 | + "2 Male 1st Class no\n", |
| 101 | + "3 Female 2nd Class no\n", |
| 102 | + "4 Male 1st Class no" |
| 103 | + ] |
| 104 | + }, |
| 105 | + "execution_count": 35, |
| 106 | + "metadata": {}, |
| 107 | + "output_type": "execute_result" |
| 108 | + } |
| 109 | + ], |
| 110 | + "source": [ |
| 111 | + "df_categorical = df.select_dtypes(exclude=np.number)\n", |
| 112 | + "df_categorical.head()" |
| 113 | + ] |
| 114 | + }, |
| 115 | + { |
| 116 | + "cell_type": "markdown", |
| 117 | + "metadata": {}, |
| 118 | + "source": [ |
| 119 | + "### 3.\tFind the distinct values in the Grade column. To do so, call the unique() function on that column of the pandas DataFrame." |
| 120 | + ] |
| 121 | + }, |
| 122 | + { |
| 123 | + "cell_type": "code", |
| 124 | + "execution_count": 36, |
| 125 | + "metadata": {}, |
| 126 | + "outputs": [ |
| 127 | + { |
| 128 | + "data": { |
| 129 | + "text/plain": [ |
| 130 | + "array(['1st Class', '2nd Class', '3rd Class'], dtype=object)" |
| 131 | + ] |
| 132 | + }, |
| 133 | + "execution_count": 36, |
| 134 | + "metadata": {}, |
| 135 | + "output_type": "execute_result" |
| 136 | + } |
| 137 | + ], |
| 138 | + "source": [ |
| 139 | + "df_categorical['Grade'].unique()" |
| 140 | + ] |
| 141 | + }, |
| 142 | + { |
| 143 | + "cell_type": "markdown", |
| 144 | + "metadata": {}, |
| 145 | + "source": [ |
| 146 | + "### 4.\tFind the frequency distribution of each categorical column. To do so, use the value_counts() function on each column." |
| 147 | + ] |
| 148 | + }, |
| 149 | + { |
| 150 | + "cell_type": "code", |
| 151 | + "execution_count": 37, |
| 152 | + "metadata": {}, |
| 153 | + "outputs": [ |
| 154 | + { |
| 155 | + "data": { |
| 156 | + "text/plain": [ |
| 157 | + "2nd Class 80\n", |
| 158 | + "3rd Class 80\n", |
| 159 | + "1st Class 72\n", |
| 160 | + "Name: Grade, dtype: int64" |
| 161 | + ] |
| 162 | + }, |
| 163 | + "execution_count": 37, |
| 164 | + "metadata": {}, |
| 165 | + "output_type": "execute_result" |
| 166 | + } |
| 167 | + ], |
| 168 | + "source": [ |
| 169 | + "df_categorical.Grade.value_counts()" |
| 170 | + ] |
| 171 | + }, |
| 172 | + { |
| 173 | + "cell_type": "code", |
| 174 | + "execution_count": 38, |
| 175 | + "metadata": {}, |
| 176 | + "outputs": [ |
| 177 | + { |
| 178 | + "data": { |
| 179 | + "text/plain": [ |
| 180 | + "Male 136\n", |
| 181 | + "Female 96\n", |
| 182 | + "Name: Gender, dtype: int64" |
| 183 | + ] |
| 184 | + }, |
| 185 | + "execution_count": 38, |
| 186 | + "metadata": {}, |
| 187 | + "output_type": "execute_result" |
| 188 | + } |
| 189 | + ], |
| 190 | + "source": [ |
| 191 | + "df_categorical.Gender.value_counts()" |
| 192 | + ] |
| 193 | + }, |
| 194 | + { |
| 195 | + "cell_type": "code", |
| 196 | + "execution_count": 39, |
| 197 | + "metadata": {}, |
| 198 | + "outputs": [ |
| 199 | + { |
| 200 | + "data": { |
| 201 | + "text/plain": [ |
| 202 | + "no 133\n", |
| 203 | + "yes 99\n", |
| 204 | + "Name: Employed, dtype: int64" |
| 205 | + ] |
| 206 | + }, |
| 207 | + "execution_count": 39, |
| 208 | + "metadata": {}, |
| 209 | + "output_type": "execute_result" |
| 210 | + } |
| 211 | + ], |
| 212 | + "source": [ |
| 213 | + "df_categorical.Employed.value_counts()" |
| 214 | + ] |
| 215 | + }, |
| 216 | + { |
| 217 | + "cell_type": "markdown", |
| 218 | + "metadata": {}, |
| 219 | + "source": [ |
| 220 | + "### 5.\tReplace the labels in the Grade column with numbers: ‘1st Class’ with 1, ‘2nd Class’ with 2 and ‘3rd Class’ with 3. To do so, use the replace() function on the DataFrame column. The Gender and Employed columns are encoded the same way." |
| 221 | + ] |
| 222 | + }, |
| 223 | + { |
| 224 | + "cell_type": "code", |
| 225 | + "execution_count": 40, |
| 226 | + "metadata": {}, |
| 227 | + "outputs": [ |
| 228 | + { |
| 229 | + "name": "stderr", |
| 230 | + "output_type": "stream", |
| 231 | + "text": [ |
| 232 | + "/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py:6586: SettingWithCopyWarning: \n", |
| 233 | + "A value is trying to be set on a copy of a slice from a DataFrame\n", |
| 234 | + "\n", |
| 235 | + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", |
| 236 | + " self._update_inplace(new_data)\n" |
| 237 | + ] |
| 238 | + } |
| 239 | + ], |
| 240 | + "source": [ |
| 241 | + "df_categorical.Grade.replace({\"1st Class\":1, \"2nd Class\":2, \"3rd Class\":3}, inplace= True)" |
| 242 | + ] |
| 243 | + }, |
| 244 | + { |
| 245 | + "cell_type": "code", |
| 246 | + "execution_count": 41, |
| 247 | + "metadata": {}, |
| 248 | + "outputs": [ |
| 249 | + { |
| 250 | + "name": "stderr", |
| 251 | + "output_type": "stream", |
| 252 | + "text": [ |
| 253 | + "/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py:5096: SettingWithCopyWarning: \n", |
| 254 | + "A value is trying to be set on a copy of a slice from a DataFrame.\n", |
| 255 | + "Try using .loc[row_indexer,col_indexer] = value instead\n", |
| 256 | + "\n", |
| 257 | + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", |
| 258 | + " self[name] = value\n" |
| 259 | + ] |
| 260 | + } |
| 261 | + ], |
| 262 | + "source": [ |
| 263 | + "df_categorical.Gender.replace({\"Male\":0,\"Female\":1}, inplace= True)" |
| 264 | + ] |
| 265 | + }, |
| 266 | + { |
| 267 | + "cell_type": "code", |
| 268 | + "execution_count": 24, |
| 269 | + "metadata": {}, |
| 270 | + "outputs": [ |
| 271 | + { |
| 272 | + "name": "stderr", |
| 273 | + "output_type": "stream", |
| 274 | + "text": [ |
| 275 | + "/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py:6586: SettingWithCopyWarning: \n", |
| 276 | + "A value is trying to be set on a copy of a slice from a DataFrame\n", |
| 277 | + "\n", |
| 278 | + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", |
| 279 | + " self._update_inplace(new_data)\n" |
| 280 | + ] |
| 281 | + } |
| 282 | + ], |
| 283 | + "source": [ |
| 284 | + "df_categorical.Employed.replace({\"yes\":1,\"no\":0}, inplace = True)" |
| 285 | + ] |
| 286 | + }, |
| 287 | + { |
| 288 | + "cell_type": "code", |
| 289 | + "execution_count": 25, |
| 290 | + "metadata": {}, |
| 291 | + "outputs": [ |
| 292 | + { |
| 293 | + "data": { |
| 294 | + "text/html": [ |
| 295 | + "<div>\n", |
| 296 | + "<style scoped>\n", |
| 297 | + " .dataframe tbody tr th:only-of-type {\n", |
| 298 | + " vertical-align: middle;\n", |
| 299 | + " }\n", |
| 300 | + "\n", |
| 301 | + " .dataframe tbody tr th {\n", |
| 302 | + " vertical-align: top;\n", |
| 303 | + " }\n", |
| 304 | + "\n", |
| 305 | + " .dataframe thead th {\n", |
| 306 | + " text-align: right;\n", |
| 307 | + " }\n", |
| 308 | + "</style>\n", |
| 309 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 310 | + " <thead>\n", |
| 311 | + " <tr style=\"text-align: right;\">\n", |
| 312 | + " <th></th>\n", |
| 313 | + " <th>Gender</th>\n", |
| 314 | + " <th>Grade</th>\n", |
| 315 | + " <th>Employed</th>\n", |
| 316 | + " </tr>\n", |
| 317 | + " </thead>\n", |
| 318 | + " <tbody>\n", |
| 319 | + " <tr>\n", |
| 320 | + " <th>0</th>\n", |
| 321 | + " <td>0</td>\n", |
| 322 | + " <td>1</td>\n", |
| 323 | + " <td>1</td>\n", |
| 324 | + " </tr>\n", |
| 325 | + " <tr>\n", |
| 326 | + " <th>1</th>\n", |
| 327 | + " <td>1</td>\n", |
| 328 | + " <td>2</td>\n", |
| 329 | + " <td>0</td>\n", |
| 330 | + " </tr>\n", |
| 331 | + " <tr>\n", |
| 332 | + " <th>2</th>\n", |
| 333 | + " <td>0</td>\n", |
| 334 | + " <td>1</td>\n", |
| 335 | + " <td>0</td>\n", |
| 336 | + " </tr>\n", |
| 337 | + " <tr>\n", |
| 338 | + " <th>3</th>\n", |
| 339 | + " <td>1</td>\n", |
| 340 | + " <td>2</td>\n", |
| 341 | + " <td>0</td>\n", |
| 342 | + " </tr>\n", |
| 343 | + " <tr>\n", |
| 344 | + " <th>4</th>\n", |
| 345 | + " <td>0</td>\n", |
| 346 | + " <td>1</td>\n", |
| 347 | + " <td>0</td>\n", |
| 348 | + " </tr>\n", |
| 349 | + " </tbody>\n", |
| 350 | + "</table>\n", |
| 351 | + "</div>" |
| 352 | + ], |
| 353 | + "text/plain": [ |
| 354 | + " Gender Grade Employed\n", |
| 355 | + "0 0 1 1\n", |
| 356 | + "1 1 2 0\n", |
| 357 | + "2 0 1 0\n", |
| 358 | + "3 1 2 0\n", |
| 359 | + "4 0 1 0" |
| 360 | + ] |
| 361 | + }, |
| 362 | + "execution_count": 25, |
| 363 | + "metadata": {}, |
| 364 | + "output_type": "execute_result" |
| 365 | + } |
| 366 | + ], |
| 367 | + "source": [ |
| 368 | + "df_categorical.head()" |
| 369 | + ] |
| 370 | + } |
| 371 | + ], |
| 372 | + "metadata": { |
| 373 | + "kernelspec": { |
| 374 | + "display_name": "Python 3", |
| 375 | + "language": "python", |
| 376 | + "name": "python3" |
| 377 | + }, |
| 378 | + "language_info": { |
| 379 | + "codemirror_mode": { |
| 380 | + "name": "ipython", |
| 381 | + "version": 3 |
| 382 | + }, |
| 383 | + "file_extension": ".py", |
| 384 | + "mimetype": "text/x-python", |
| 385 | + "name": "python", |
| 386 | + "nbconvert_exporter": "python", |
| 387 | + "pygments_lexer": "ipython3", |
| 388 | + "version": "3.6.4" |
| 389 | + } |
| 390 | + }, |
| 391 | + "nbformat": 4, |
| 392 | + "nbformat_minor": 2 |
| 393 | +} |
0 commit comments