# Linear regression for **text length**, given an HTML `<div>` of <u>known width and height and a given text style</u>

In [1]:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib widget

# from mpl_toolkits import mplot3d
# import seaborn as sns
# sns.set_style("darkgrid")


In [2]:
# Load the measured samples through the public pandas entry point
# (`pd.read_json`) rather than the `pd.io.json` submodule path.
# The cells below read the columns `w` (box width), `h` (box height)
# and `l` (text length) from this frame.
raw_df = pd.read_json("length.json")

# Largest box side (same unit as `w` / `h`) that is kept for the regression.
MAX_BOX_SIDE = 2000

# Remove exact duplicate rows, then discard oversized boxes with a boolean
# mask. Masking works on row position, so it stays correct even if index
# labels repeat (dropping by label would remove every row sharing a label).
# Rows whose `w` or `h` is missing compare as False and are therefore kept.
deduped_df = raw_df.drop_duplicates()
oversized = (deduped_df["w"] > MAX_BOX_SIDE) | (deduped_df["h"] > MAX_BOX_SIDE)

# `.copy()` gives an independent frame, so adding columns to `df` later does
# not trigger pandas' SettingWithCopyWarning.
df = deduped_df[~oversized].copy()

# print(df.head())
# print()
# print(df.describe())
# print()
# print(df.info());
# print()


In [3]:
# 3D scatter of the samples: box width on x, box height on y and text length
# on z; the points are also coloured by text length.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(projection='3d')
ax.set_title("Plot of dataset")
ax.scatter(df["w"], df["h"], df["l"], c=df["l"], cmap="viridis")
ax.set(xlabel='box width', ylabel='box height')
ax.set_zlabel("text length")


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'text length')

In [4]:
# Interaction feature: box area, the element-wise product of width and height.
df["area"] = df["w"].mul(df["h"])


In [5]:
# Hold out a quarter of the samples for evaluation; the fixed random_state
# keeps the split reproducible between runs.
features = df[["w", "h", "area"]].values
target = df["l"].values
wh_train, wh_test, l_train, l_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=1,
)


In [6]:
# Least-squares fit on the training split. `fit` returns the estimator
# itself, so construction and fitting are chained into one statement.
lin_reg = linear_model.LinearRegression().fit(wh_train, l_train)

print("Intercept:", lin_reg.intercept_)
print("Least-Squares Coefficients:", lin_reg.coef_)

# `score` returns the coefficient of determination (R^2) of the model's
# predictions on the given split.
train_score = lin_reg.score(wh_train, l_train)
test_score = lin_reg.score(wh_test, l_test)
print("Measure on training data:", train_score)
print("Measure on test data:", test_score)


Intercept: -8.126482949847684
Least-Squares Coefficients: [-0.13599902 -0.2172954   0.00569047]
Measure on training data: 0.9953477289607526
Measure on test data: 0.9638037200307745


In [7]:
# Predicted text length for every sample. Using the estimator's own
# `predict` avoids re-deriving the formula by hand from `coef_` and
# `intercept_`, which silently goes wrong if the feature list or its order
# ever changes. The columns are passed in the same order used to build the
# training matrix, and as a plain array because the model was fitted on
# arrays without feature names.
zs = lin_reg.predict(df[["w", "h", "area"]].values)

# Overlay the fitted surface on the scatter of the measured samples.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(projection='3d')
ax.set_title("Plot of prediction plane")
ax.scatter(df["w"], df["h"], df["l"], c=df["l"], cmap="viridis")
ax.plot_trisurf(df["w"], df["h"], zs)
ax.set_xlabel('box width')
ax.set_ylabel('box height')
ax.set_zlabel("text length")


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'text length')