In [56]:
# Regression Examples

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Toy housing dataset: three numeric features plus the sale price to predict
data = {
    'Size': [1500, 1600, 1700, 1800, 1900],
    'Bedrooms': [3, 3, 4, 4, 5],
    'Age': [10, 15, 20, 25, 30],
    'Price': [300000, 320000, 340000, 360000, 380000],
}
df = pd.DataFrame(data)

# Separate the feature matrix from the target column
feature_columns = ['Size', 'Bedrooms', 'Age']
X = df[feature_columns]
y = df['Price']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training rows (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Report the fitted parameters
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")


Mean Squared Error: 3.3881317890172014e-21
Coefficients: [1.99501247e+02 1.42108547e-14 9.97506234e+00]
Intercept: 648.3790523692151


In [54]:
# Classification example
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Toy spam dataset: three numeric features and a binary 'Spam' label
data = {
    'Number of Words': [100, 50, 200, 300, 150],
    'Number of Links': [3, 1, 5, 10, 2],
    'Contains Free': [1, 0, 1, 1, 0],  # 1 for Yes, 0 for No
    'Spam': [1, 0, 1, 1, 0]
}

# Create DataFrame
df = pd.DataFrame(data)

# Features and target variable
X = df[['Number of Words', 'Number of Links', 'Contains Free']]
y = df['Spam']

# Split the dataset into training and testing sets.
# With five rows and test_size=0.2 the test set holds a single row, so the
# metrics below describe one prediction only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
# Name both classes explicitly: with a one-row test set the matrix would
# otherwise shrink to 1x1 whenever the single prediction is correct.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.0
Confusion Matrix:
[[0 1]
 [0 0]]


In [55]:
import numpy as np

# A seeded generator reproduces the same sequence of draws on every run
rnd = np.random.RandomState(123)

# Draw a 3 x 5 array of uniform samples between 0.0 and 1.0
X = rnd.uniform(0.0, 1.0, (3, 5))
print(X)

print('=' * 40)

# No seed given: this generator is not tied to a fixed sequence
rnd1 = np.random.RandomState()
random_numbers1 = rnd1.uniform(0.0, 1.0, 5)
print(random_numbers1)

print('=' * 40)

# A second unseeded generator, independent of the first
rnd2 = np.random.RandomState()
random_numbers2 = rnd2.uniform(0.0, 1.0, 5)
print(random_numbers2)


[[0.69646919 0.28613933 0.22685145 0.55131477 0.71946897]
 [0.42310646 0.9807642  0.68482974 0.4809319  0.39211752]
 [0.34317802 0.72904971 0.43857224 0.0596779  0.39804426]]
[0.58129264 0.77532729 0.7723126  0.91155981 0.52103339]
[0.97883085 0.0181277  0.83094938 0.40239485 0.0247753 ]


In [53]:
import numpy as np
# Five evenly spaced values from 0 to 12, endpoints included
y = np.linspace(0, 12, num=5)
print(y)

# Indexing with np.newaxis appends an axis of length 1, giving a 5 x 1 column
column = y[:, np.newaxis]
print(column)

[ 0.  3.  6.  9. 12.]
[[ 0.]
 [ 3.]
 [ 6.]
 [ 9.]
 [12.]]


In [58]:
import numpy as np
# Inspect an array's shape, then regroup the same values into new dimensions
rnd = np.random.RandomState(123)
X = rnd.uniform(0.0, 1.0, (3, 5))  # 3 rows x 5 columns
print(X.shape)
# reshape keeps the 15 values in their existing order and lays them out as 5 x 3
print(X.reshape((5, 3)))

(3, 5)
[[0.69646919 0.28613933 0.22685145]
 [0.55131477 0.71946897 0.42310646]
 [0.9807642  0.68482974 0.4809319 ]
 [0.39211752 0.34317802 0.72904971]
 [0.43857224 0.0596779  0.39804426]]


In [61]:
# Fancy indexing: an integer array selects columns 3, 1 and 0, in that order
indices = np.array((3, 1, 0))
print(indices)
X[:, indices]

[3 1 0]


array([[0.55131477, 0.28613933, 0.69646919],
       [0.4809319 , 0.9807642 , 0.42310646],
       [0.0596779 , 0.72904971, 0.34317802]])

In [59]:
a = 'lorem10orem 10orem 10orem 10orem 10orem 10orem 10orem 10orem 10orem 10orem'
# Show only the first 20 characters of the string
print(a[0:20])

lorem10orem 10orem 1


In [57]:
import pandas as pd

# Build a small three-row DataFrame to practise selection on
data = {
    'Name': ['Alice', 'Bob', 'Carol'],
    'Age': [30, 25, 28],
    'Country': ['USA', 'Canada', 'UK'],
}
df = pd.DataFrame(data)

# Column selection: a single label selects the 'Name' column
print(df['Name'])

print('=' * 40)

# A list of labels selects several columns at once ('Name' and 'Age')
print(df[['Name', 'Age']])

print('=' * 40)

# Row selection by integer position
print(df.iloc[0])  # first row (Alice)

print('=' * 40)

print(df.iloc[1:3])  # positions 1 and 2 (Bob and Carol); the stop position is excluded

print('=' * 40)

# Single-value lookups by row label and column label
print(df.at[0, 'Name'])  # 'Alice'

print('=' * 40)

print(df.loc[1, 'Age'])  # 25 (Bob's age)


0    Alice
1      Bob
2    Carol
Name: Name, dtype: object
    Name  Age
0  Alice   30
1    Bob   25
2  Carol   28
Name       Alice
Age           30
Country      USA
Name: 0, dtype: object
    Name  Age Country
1    Bob   25  Canada
2  Carol   28      UK
Alice
25


In [60]:
# A plain slice on the DataFrame only slices rows; 0:0 is an empty range, so no rows are returned
print(df[0:0])
print('=' * 40)

# df.loc is used for label-based indexing: it accesses rows and columns
# by their labels (names).

print(df.loc[0])  # the row labelled 0 (Alice's information)

print('=' * 40)

print(df.loc[0:1])  # label slices include both ends: rows 0 and 1 (Alice and Bob)

Empty DataFrame
Columns: [Name, Age, Country]
Index: []
Name       Alice
Age           30
Country      USA
Name: 0, dtype: object
    Name  Age Country
0  Alice   30     USA
1    Bob   25  Canada


In [63]:
# Scalar lookup: row label 0, column 'Name'
print(df.loc[0, 'Name'])  # 'Alice'

print('=' * 40)

# Row label slice combined with a list of column labels: 'Name' and 'Age' for rows 0 and 1
wanted_columns = ['Name', 'Age']
print(df.loc[0:1, wanted_columns])


Alice
    Name  Age
0  Alice   30
1    Bob   25


In [66]:
# Boolean mask: keep only the rows whose 'Age' is greater than 25
is_over_25 = df['Age'] > 25
print(df.loc[is_over_25])


    Name  Age Country
0  Alice   30     USA
2  Carol   28      UK


In [62]:
# df.at accesses a single value for a row/column label pair.
# It is very fast for scalar access and is the accessor to use when getting
# or setting exactly one value in a DataFrame.

print(df.at[0, 'Name'])  # 'Alice'

Alice


In [64]:
# df.at also supports assignment: overwrite one cell, then read it back
row_label, column_label = 0, 'Name'
df.at[row_label, column_label] = 'Alicia'
print(df.at[row_label, column_label])  # 'Alicia'


Alicia


Summary of Differences

    df.loc (location):
        Label-based indexing.
        Can access multiple rows and columns.
        Can use boolean arrays for indexing.
        Syntax: df.loc[row_labels, column_labels].

    df.iloc (integer location):
        Integer-based (positional) indexing.
        Can access multiple rows and columns.
        Syntax: df.iloc[row_positions, column_positions].

    df.at:
        Used for accessing a single scalar value.
        Faster for accessing single elements compared to df.loc and df.iloc.
        Syntax: df.at[row_label, column_label].

In [65]:
import pandas as pd

# Recreate the sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Carol'],
    'Age': [30, 25, 28],
    'Country': ['USA', 'Canada', 'UK'],
}
df = pd.DataFrame(data)

# .loc selects by label; the slice 0:2 includes the stop label, so three rows come back
name_and_age = ['Name', 'Age']
print(df.loc[0:2, name_and_age])

# .iloc selects by integer position; the slice 0:2 excludes the stop, so two rows come back
print(df.iloc[0:2, [0, 1]])

# .at reads a single value...
print(df.at[1, 'Country'])  # 'Canada'

# ...and can assign a single value as well
df.at[1, 'Country'] = 'France'
print(df.at[1, 'Country'])  # 'France'


    Name  Age
0  Alice   30
1    Bob   25
2  Carol   28
    Name  Age
0  Alice   30
1    Bob   25
Canada
France


In [69]:
import pandas as pd

# Recreate the sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Carol'],
    'Age': [30, 25, 28],
    'Country': ['USA', 'Canada', 'UK'],
}
df = pd.DataFrame(data)

# Plain slicing on the frame is positional: 0:1 stops before position 1, so one row is shown
print(df[0:1])

print('=' * 40)

# .loc slicing is label-based and includes the stop label, so rows 0 and 1 are shown
print(df.loc[0:1])


    Name  Age Country
0  Alice   30     USA
    Name  Age Country
0  Alice   30     USA
1    Bob   25  Canada


Rows represent individual records or data points.
Columns represent different attributes or features of the data.


df[0:1]

    Purpose: This is primarily used for row slicing by integer position.
    Behavior: When you use df[0:1], it slices the DataFrame to return the rows from index 0 to 1 (excluding 1).


df.loc[0:1]

    Purpose: This is used for label-based indexing, which can include slicing by index labels.
    Behavior: When you use df.loc[0:1], it slices the DataFrame to return the rows with labels 0 and 1 (inclusive of both).

Summary

    Use df[0:1] for slicing rows by integer position.
    Use df.loc[0:1] for slicing rows by index labels.
    Use df.loc[:, 'col1':'col2'] for slicing columns by their labels.


Index labels:
In pandas, an index label refers to the labels or names assigned to the rows in a DataFrame. By default, these are integers starting from 0, but you can customize them to be any labels you want, such as strings or dates.
When you create a DataFrame without specifying an index, pandas assigns default integer index labels starting from 0.

    Name  Age Country
0  Alice   30     USA
1    Bob   25  Canada
2  Carol   28      UK

Here, the index labels are 0, 1, and 2.


# Loc is best
df.loc: Access or set groups of rows and columns by labels or boolean arrays.

In [3]:
import pandas as pd

# Recreate the sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Carol'],
    'Age': [30, 25, 28],
    'Country': ['USA', 'Canada', 'UK'],
}
df = pd.DataFrame(data)

# Column labels of the frame
print(df.keys())

print('=' * 40)

# Underlying data as a NumPy array.
# Note: `values` is an attribute, not a method, so it is read without parentheses.
print(df.values)

print('=' * 40)

# --- Selecting rows ---

# Single row, by label 0
print(df.loc[0])

print('=' * 40)

# Label slice 0:1 (both ends included): rows 0 and 1
print(df.loc[0:1])

print('=' * 40)

# --- Selecting rows together with columns ---

# Row 0, every column
print(df.loc[0, :])

print('=' * 40)

# Row 0, columns from 'Name' through 'Country'
print(df.loc[0, 'Name':'Country'])

print('=' * 40)

# Every row, columns from 'Name' through 'Country'
print(df.loc[:, 'Name':'Country'])

print('=' * 40)

# One cell: row 0, column 'Name'
print(df.loc[0, 'Name'])

print('=' * 40)

# Rows 0 and 1, restricted to the 'Name' and 'Age' columns
print(df.loc[0:1, ['Name', 'Age']])

print('=' * 40)

# Every row of the 'Name' column
print(df.loc[:, 'Name'])

print('=' * 40)

# Every row of the 'Name' and 'Age' columns
print(df.loc[:, ['Name', 'Age']])

print('=' * 40)

# --- Boolean indexing ---

# Rows where 'Age' is greater than 25
print(df.loc[df['Age'] > 25])

print('=' * 40)

# --- Assignment ---

# Overwrite 'Country' for the row labelled 1, then show the whole frame
df.loc[1, 'Country'] = 'France'
print(df)

print('=' * 40)

# --- Conditions combined with column selection ---

# 'Name' and 'Country' for the rows where 'Age' is greater than 25
print(df.loc[df['Age'] > 25, ['Name', 'Country']])

print('=' * 40)


Index(['Name', 'Age', 'Country'], dtype='object')
[['Alice' 30 'USA']
 ['Bob' 25 'Canada']
 ['Carol' 28 'UK']]
Name       Alice
Age           30
Country      USA
Name: 0, dtype: object
    Name  Age Country
0  Alice   30     USA
1    Bob   25  Canada
Name       Alice
Age           30
Country      USA
Name: 0, dtype: object
Name       Alice
Age           30
Country      USA
Name: 0, dtype: object
    Name  Age Country
0  Alice   30     USA
1    Bob   25  Canada
2  Carol   28      UK
Alice
    Name  Age
0  Alice   30
1    Bob   25
0    Alice
1      Bob
2    Carol
Name: Name, dtype: object
    Name  Age
0  Alice   30
1    Bob   25
2  Carol   28
    Name  Age Country
0  Alice   30     USA
2  Carol   28      UK
    Name  Age Country
0  Alice   30     USA
1    Bob   25  France
2  Carol   28      UK
    Name Country
0  Alice     USA
2  Carol      UK
